148 files changed, 8199 insertions, 5129 deletions
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index cca6d12..572617c 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -349,59 +349,6 @@ AArch64FrameLowering::resolveFrameIndexReference(MachineFunction &MF,
   return TopOfFrameOffset - FrameRegPos;
 }
 
-/// Estimate and return the size of the frame.
-static unsigned estimateStackSize(MachineFunction &MF) {
-  // FIXME: Make generic? Really consider after upstreaming.  This code is now
-  // shared between PEI, ARM *and* here.
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
-  unsigned MaxAlign = MFI->getMaxAlignment();
-  int Offset = 0;
-
-  // This code is very, very similar to PEI::calculateFrameObjectOffsets().
-  // It really should be refactored to share code. Until then, changes
-  // should keep in mind that there's tight coupling between the two.
-
-  for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
-    int FixedOff = -MFI->getObjectOffset(i);
-    if (FixedOff > Offset) Offset = FixedOff;
-  }
-  for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
-    if (MFI->isDeadObjectIndex(i))
-      continue;
-    Offset += MFI->getObjectSize(i);
-    unsigned Align = MFI->getObjectAlignment(i);
-    // Adjust to alignment boundary
-    Offset = (Offset+Align-1)/Align*Align;
-
-    MaxAlign = std::max(Align, MaxAlign);
-  }
-
-  if (MFI->adjustsStack() && TFI->hasReservedCallFrame(MF))
-    Offset += MFI->getMaxCallFrameSize();
-
-  // Round up the size to a multiple of the alignment.  If the function has
-  // any calls or alloca's, align to the target's StackAlignment value to
-  // ensure that the callee's frame or the alloca data is suitably aligned;
-  // otherwise, for leaf functions, align to the TransientStackAlignment
-  // value.
-  unsigned StackAlign;
-  if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
-      (RegInfo->needsStackRealignment(MF) && MFI->getObjectIndexEnd() != 0))
-    StackAlign = TFI->getStackAlignment();
-  else
-    StackAlign = TFI->getTransientStackAlignment();
-
-  // If the frame pointer is eliminated, all frame offsets will be relative to
-  // SP not FP. Align to MaxAlign so this works.
-  StackAlign = std::max(StackAlign, MaxAlign);
-  unsigned AlignMask = StackAlign - 1;
-  Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
-
-  return (unsigned)Offset;
-}
-
 void
 AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                                        RegScavenger *RS) const {
@@ -422,7 +369,7 @@ AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
   // callee-save register for this purpose or allocate an extra spill slot.
 
   bool BigStack =
-    (RS && estimateStackSize(MF) >= TII.estimateRSStackLimit(MF))
+    (RS && MFI->estimateStackSize(MF) >= TII.estimateRSStackLimit(MF))
     || MFI->hasVarSizedObjects() // Access will be from X29: messes things up
     || (MFI->adjustsStack() && !hasReservedCallFrame(MF));
 
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index cea7f91..e9f4497 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -200,6 +200,8 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
   setOperationAction(ISD::FSIN, MVT::f32, Expand);
   setOperationAction(ISD::FSIN, MVT::f64, Expand);
 
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 
   // Virtually no operation on f128 is legal, but LLVM can't expand them when
   // there's a valid register class, so we need custom operations in most cases.
@@ -217,6 +219,7 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
   setOperationAction(ISD::FREM,       MVT::f128, Expand);
   setOperationAction(ISD::FRINT,      MVT::f128, Expand);
   setOperationAction(ISD::FSIN,       MVT::f128, Expand);
+  setOperationAction(ISD::FSINCOS,    MVT::f128, Expand);
   setOperationAction(ISD::FSQRT,      MVT::f128, Expand);
   setOperationAction(ISD::FSUB,       MVT::f128, Custom);
   setOperationAction(ISD::FTRUNC,     MVT::f128, Expand);
@@ -341,8 +344,7 @@ AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   //   ldxr dest, ptr
   //   <binop> scratch, dest, incr
   //   stxr stxr_status, scratch, ptr
-  //   cmp stxr_status, #0
-  //   b.ne loopMBB
+  //   cbnz stxr_status, loopMBB
   //   fallthrough --> exitMBB
   BB = loopMBB;
   BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
@@ -364,10 +366,8 @@ AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
 
   BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr);
-  BuildMI(BB, dl, TII->get(AArch64::SUBwwi_lsl0_cmp))
-    .addReg(stxr_status).addImm(0);
-  BuildMI(BB, dl, TII->get(AArch64::Bcc))
-    .addImm(A64CC::NE).addMBB(loopMBB);
+  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
+    .addReg(stxr_status).addMBB(loopMBB);
 
   BB->addSuccessor(loopMBB);
   BB->addSuccessor(exitMBB);
@@ -437,8 +437,7 @@ AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
   //   cmp incr, dest (, sign extend if necessary)
   //   csel scratch, dest, incr, cond
   //   stxr stxr_status, scratch, ptr
-  //   cmp stxr_status, #0
-  //   b.ne loopMBB
+  //   cbnz stxr_status, loopMBB
   //   fallthrough --> exitMBB
   BB = loopMBB;
   BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
@@ -457,10 +456,8 @@ AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
 
   BuildMI(BB, dl, TII->get(strOpc), stxr_status)
     .addReg(scratch).addReg(ptr);
-  BuildMI(BB, dl, TII->get(AArch64::SUBwwi_lsl0_cmp))
-    .addReg(stxr_status).addImm(0);
-  BuildMI(BB, dl, TII->get(AArch64::Bcc))
-    .addImm(A64CC::NE).addMBB(loopMBB);
+  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
+    .addReg(stxr_status).addMBB(loopMBB);
 
   BB->addSuccessor(loopMBB);
   BB->addSuccessor(exitMBB);
@@ -533,17 +530,14 @@ AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
 
   // loop2MBB:
   //   strex stxr_status, newval, [ptr]
-  //   cmp stxr_status, #0
-  //   b.ne loop1MBB
+  //   cbnz stxr_status, loop1MBB
   BB = loop2MBB;
   unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
   MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
 
   BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr);
-  BuildMI(BB, dl, TII->get(AArch64::SUBwwi_lsl0_cmp))
-    .addReg(stxr_status).addImm(0);
-  BuildMI(BB, dl, TII->get(AArch64::Bcc))
-    .addImm(A64CC::NE).addMBB(loop1MBB);
+  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
+    .addReg(stxr_status).addMBB(loop1MBB);
   BB->addSuccessor(loop1MBB);
   BB->addSuccessor(exitMBB);
 
@@ -1861,11 +1855,10 @@ AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
   const GlobalValue *GV = GN->getGlobal();
   unsigned Alignment = GV->getAlignment();
   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
-
-  if (GV->isWeakForLinker() && RelocM == Reloc::Static) {
-    // Weak symbols can't use ADRP/ADD pair since they should evaluate to
-    // zero when undefined. In PIC mode the GOT can take care of this, but in
-    // absolute mode we use a constant pool load.
+  if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) {
+    // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate
+    // to zero when they remain undefined. In PIC mode the GOT can take care of
+    // this, but in absolute mode we use a constant pool load.
     SDValue PoolAddr;
     PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
                            DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
@@ -1873,10 +1866,16 @@ AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
                            DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
                                                      AArch64II::MO_LO12),
                            DAG.getConstant(8, MVT::i32));
-    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr,
-                       MachinePointerInfo::getConstantPool(),
-                       /*isVolatile=*/ false,  /*isNonTemporal=*/ true,
-                       /*isInvariant=*/ true, 8);
+    SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr,
+                                     MachinePointerInfo::getConstantPool(),
+                                     /*isVolatile=*/ false,
+                                     /*isNonTemporal=*/ true,
+                                     /*isInvariant=*/ true, 8);
+    if (GN->getOffset() != 0)
+      return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
+                         DAG.getConstant(GN->getOffset(), PtrVT));
+
+    return GlobalAddr;
   }
 
   if (Alignment == 0) {
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 562a7f6..319ec97 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -159,7 +159,7 @@ let Defs = [XSP], Uses = [XSP] in {
 // Atomic operation pseudo-instructions
 //===----------------------------------------------------------------------===//
 
-let usesCustomInserter = 1, Defs = [NZCV] in {
+let usesCustomInserter = 1 in {
 multiclass AtomicSizes<string opname> {
   def _I8 : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$incr),
     [(set GPR32:$dst, (!cast<SDNode>(opname # "_8") GPR64:$ptr, GPR32:$incr))]>;
@@ -178,11 +178,14 @@ defm ATOMIC_LOAD_AND  : AtomicSizes<"atomic_load_and">;
 defm ATOMIC_LOAD_OR   : AtomicSizes<"atomic_load_or">;
 defm ATOMIC_LOAD_XOR  : AtomicSizes<"atomic_load_xor">;
 defm ATOMIC_LOAD_NAND : AtomicSizes<"atomic_load_nand">;
-defm ATOMIC_LOAD_MIN  : AtomicSizes<"atomic_load_min">;
-defm ATOMIC_LOAD_MAX  : AtomicSizes<"atomic_load_max">;
-defm ATOMIC_LOAD_UMIN : AtomicSizes<"atomic_load_umin">;
-defm ATOMIC_LOAD_UMAX : AtomicSizes<"atomic_load_umax">;
 defm ATOMIC_SWAP      : AtomicSizes<"atomic_swap">;
+let Defs = [NZCV] in {
+  // These operations need a CMP to calculate the correct value
+  defm ATOMIC_LOAD_MIN  : AtomicSizes<"atomic_load_min">;
+  defm ATOMIC_LOAD_MAX  : AtomicSizes<"atomic_load_max">;
+  defm ATOMIC_LOAD_UMIN : AtomicSizes<"atomic_load_umin">;
+  defm ATOMIC_LOAD_UMAX : AtomicSizes<"atomic_load_umax">;
+}
 
 let usesCustomInserter = 1, Defs = [NZCV] in {
 def ATOMIC_CMP_SWAP_I8
@@ -1942,43 +1945,41 @@ def fpz32 : Operand<f32>,
             ComplexPattern<f32, 1, "SelectFPZeroOperand", [fpimm]> {
   let ParserMatchClass = fpzero_asmoperand;
   let PrintMethod = "printFPZeroOperand";
+  let DecoderMethod = "DecodeFPZeroOperand";
 }
 
 def fpz64 : Operand<f64>,
             ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> {
   let ParserMatchClass = fpzero_asmoperand;
   let PrintMethod = "printFPZeroOperand";
+  let DecoderMethod = "DecodeFPZeroOperand";
 }
 
-multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, string asmop2,
-                            dag pattern> {
+multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, dag pattern> {
   def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0},
-                          (outs), ins, !strconcat("fcmp\t$Rn, ", asmop2),
-                          [pattern], NoItinerary> {
+                          (outs), ins, "fcmp\t$Rn, $Rm", [pattern],
+                          NoItinerary> {
     let Defs = [NZCV];
   }
 
   def _sig : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b1, imm, 0b0, 0b0, 0b0},
-                        (outs), ins, !strconcat("fcmpe\t$Rn, ", asmop2),
-                        [], NoItinerary> {
+                        (outs), ins, "fcmpe\t$Rn, $Rm", [], NoItinerary> {
     let Defs = [NZCV];
   }
 }
 
-defm FCMPss : A64I_fpcmpSignal<0b00, 0b0, (ins FPR32:$Rn, FPR32:$Rm), "$Rm",
+defm FCMPss : A64I_fpcmpSignal<0b00, 0b0, (ins FPR32:$Rn, FPR32:$Rm),
                                (set NZCV, (A64cmp (f32 FPR32:$Rn), FPR32:$Rm))>;
-defm FCMPdd : A64I_fpcmpSignal<0b01, 0b0, (ins FPR64:$Rn, FPR64:$Rm), "$Rm",
+defm FCMPdd : A64I_fpcmpSignal<0b01, 0b0, (ins FPR64:$Rn, FPR64:$Rm),
                                (set NZCV, (A64cmp (f64 FPR64:$Rn), FPR64:$Rm))>;
 
-// What would be Rm should be written as 0, but anything is valid for
-// disassembly so we can't set the bits
-let PostEncoderMethod = "fixFCMPImm" in {
-  defm FCMPsi : A64I_fpcmpSignal<0b00, 0b1, (ins FPR32:$Rn, fpz32:$Imm), "$Imm",
-                              (set NZCV, (A64cmp (f32 FPR32:$Rn), fpz32:$Imm))>;
+// What would be Rm should be written as 0; note that even though it's called
+// "$Rm" here to fit in with the InstrFormats, it's actually an immediate.
+defm FCMPsi : A64I_fpcmpSignal<0b00, 0b1, (ins FPR32:$Rn, fpz32:$Rm),
+                               (set NZCV, (A64cmp (f32 FPR32:$Rn), fpz32:$Rm))>;
 
-  defm FCMPdi : A64I_fpcmpSignal<0b01, 0b1, (ins FPR64:$Rn, fpz64:$Imm), "$Imm",
-                              (set NZCV, (A64cmp (f64 FPR64:$Rn), fpz64:$Imm))>;
-}
+defm FCMPdi : A64I_fpcmpSignal<0b01, 0b1, (ins FPR64:$Rn, fpz64:$Rm),
+                               (set NZCV, (A64cmp (f64 FPR64:$Rn), fpz64:$Rm))>;
 
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index c1695da..69bb80a 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -160,44 +160,53 @@ private:
 
   SMLoc StartLoc, EndLoc;
 
+  struct ImmWithLSLOp {
+    const MCExpr *Val;
+    unsigned ShiftAmount;
+    bool ImplicitAmount;
+  };
+
+  struct CondCodeOp {
+    A64CC::CondCodes Code;
+  };
+
+  struct FPImmOp {
+    double Val;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+  };
+
+  struct ShiftExtendOp {
+    A64SE::ShiftExtSpecifiers ShiftType;
+    unsigned Amount;
+    bool ImplicitAmount;
+  };
+
+  struct SysRegOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
   union {
-    struct {
-      const MCExpr *Val;
-      unsigned ShiftAmount;
-      bool ImplicitAmount;
-    } ImmWithLSL;
-
-    struct {
-      A64CC::CondCodes Code;
-    } CondCode;
-
-    struct {
-      double Val;
-    } FPImm;
-
-    struct {
-      const MCExpr *Val;
-    } Imm;
-
-    struct {
-      unsigned RegNum;
-    } Reg;
-
-    struct {
-      A64SE::ShiftExtSpecifiers ShiftType;
-      unsigned Amount;
-      bool ImplicitAmount;
-    } ShiftExtend;
-
-    struct {
-      const char *Data;
-      unsigned Length;
-    } SysReg;
-
-    struct {
-      const char *Data;
-      unsigned Length;
-    } Tok;
+    struct ImmWithLSLOp ImmWithLSL;
+    struct CondCodeOp CondCode;
+    struct FPImmOp FPImm;
+    struct ImmOp Imm;
+    struct RegOp Reg;
+    struct ShiftExtendOp ShiftExtend;
+    struct SysRegOp SysReg;
+    struct TokOp Tok;
   };
 
   AArch64Operand(KindTy K, SMLoc S, SMLoc E)
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index eba7666..12c1b8f 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -106,6 +106,11 @@ static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst,
                                                uint64_t Address,
                                                const void *Decoder);
 
+static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
+                                        unsigned RmBits,
+                                        uint64_t Address,
+                                        const void *Decoder);
+
 template<int RegWidth>
 static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
                                              unsigned FullImm,
@@ -381,6 +386,17 @@ static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
+                                        unsigned RmBits,
+                                        uint64_t Address,
+                                        const void *Decoder) {
+  // Any bits are valid in the instruction (they're architecturally ignored),
+  // but a code generator should insert 0.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  return MCDisassembler::Success;
+}
+
+
 
 template<int RegWidth>
 static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 756e037..a5c591e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -106,8 +106,6 @@ public:
   void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups) const;
 
-  unsigned fixFCMPImm(const MCInst &MI, unsigned EncodedValue) const;
-
   template<int hasRs, int hasRt2> unsigned
   fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue) const;
 
@@ -423,15 +421,6 @@ AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
   return Result | getAddressWithFixup(UImm16MO, requestedFixup, Fixups);
 }
 
-unsigned AArch64MCCodeEmitter::fixFCMPImm(const MCInst &MI,
-                                          unsigned EncodedValue) const {
-    // For FCMP[E] Rn, #0.0, the Rm field has a canonical representation
-    // with 0s, but is architecturally ignored
-    EncodedValue &= ~0x1f0000u;
-
-    return EncodedValue;
-}
-
 template<int hasRs, int hasRt2> unsigned
 AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI,
                                             unsigned EncodedValue) const {
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
new file mode 100644
index 0000000..f0d4dbe
--- /dev/null
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -0,0 +1,704 @@
+//=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The Cortex-A15 processor employs a tracking scheme in its register renaming
+// in order to process each instruction's micro-ops speculatively and
+// out-of-order with appropriate forwarding. The ARM architecture allows VFP
+// instructions to read and write 32-bit S-registers.  Each S-register
+// corresponds to one half (upper or lower) of an overlaid 64-bit D-register.
+//
+// There are several instruction patterns which can be used to provide this
+// capability which can provide higher performance than other, potentially more
+// direct patterns, specifically around when one micro-op reads a D-register
+// operand that has recently been written as one or more S-register results.
+//
+// This file defines a pre-regalloc pass which looks for SPR producers which
+// are going to be used by a DPR (or QPR) consumers and creates the more
+// optimized access pattern.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "a15-sd-optimizer"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMISelLowering.h"
+#include "ARMTargetMachine.h"
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <set>
+
+using namespace llvm;
+
+namespace {
+  struct A15SDOptimizer : public MachineFunctionPass {
+    static char ID;
+    A15SDOptimizer() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM A15 S->D optimizer";
+    }
+
+  private:
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo *MRI;
+
+    bool runOnInstruction(MachineInstr *MI);
+
+    //
+    // Instruction builder helpers
+    //
+    unsigned createDupLane(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator InsertBefore,
+                           DebugLoc DL,
+                           unsigned Reg, unsigned Lane,
+                           bool QPR=false);
+
+    unsigned createExtractSubreg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator InsertBefore,
+                                 DebugLoc DL,
+                                 unsigned DReg, unsigned Lane,
+                                 const TargetRegisterClass *TRC);
+
+    unsigned createVExt(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator InsertBefore,
+                        DebugLoc DL,
+                        unsigned Ssub0, unsigned Ssub1);
+
+    unsigned createRegSequence(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator InsertBefore,
+                               DebugLoc DL,
+                               unsigned Reg1, unsigned Reg2);
+
+    unsigned createInsertSubreg(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator InsertBefore,
+                                DebugLoc DL, unsigned DReg, unsigned Lane,
+                                unsigned ToInsert);
+
+    unsigned createImplicitDef(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator InsertBefore,
+                               DebugLoc DL);
+    
+    //
+    // Various property checkers
+    //
+    bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC);
+    bool hasPartialWrite(MachineInstr *MI);
+    SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI);
+    unsigned getDPRLaneFromSPR(unsigned SReg);
+
+    //
+    // Methods used for getting the definitions of partial registers
+    //
+
+    MachineInstr *elideCopies(MachineInstr *MI);
+    void elideCopiesAndPHIs(MachineInstr *MI,
+                            SmallVectorImpl<MachineInstr*> &Outs);
+
+    //
+    // Pattern optimization methods
+    //
+    unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg);
+    unsigned optimizeSDPattern(MachineInstr *MI);
+    unsigned getPrefSPRLane(unsigned SReg);
+
+    //
+    // Sanitizing method - used to make sure if don't leave dead code around.
+    //
+    void eraseInstrWithNoUses(MachineInstr *MI);
+
+    //
+    // A map used to track the changes done by this pass.
+    //
+    std::map<MachineInstr*, unsigned> Replacements;
+    std::set<MachineInstr *> DeadInstr;
+  };
+  char A15SDOptimizer::ID = 0;
+} // end anonymous namespace
+
+// Returns true if this is a use of a SPR register.
+bool A15SDOptimizer::usesRegClass(MachineOperand &MO,
+                                  const TargetRegisterClass *TRC) {
+  if (!MO.isReg())
+    return false;
+  unsigned Reg = MO.getReg();
+
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return MRI->getRegClass(Reg)->hasSuperClassEq(TRC);
+  else
+    return TRC->contains(Reg);
+}
+
+unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) {
+  unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1,
+                                           &ARM::DPRRegClass);
+  if (DReg != ARM::NoRegister) return ARM::ssub_1;
+  return ARM::ssub_0;
+}
+
+// Get the subreg type that is most likely to be coalesced
+// for an SPR register that will be used in VDUP32d pseudo.
+unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
+  if (!TRI->isVirtualRegister(SReg))
+    return getDPRLaneFromSPR(SReg);
+
+  MachineInstr *MI = MRI->getVRegDef(SReg);
+  if (!MI) return ARM::ssub_0;
+  MachineOperand *MO = MI->findRegisterDefOperand(SReg);
+
+  assert(MO->isReg() && "Non register operand found!");
+  if (!MO) return ARM::ssub_0;
+
+  if (MI->isCopy() && usesRegClass(MI->getOperand(1),
+                                    &ARM::SPRRegClass)) {
+    SReg = MI->getOperand(1).getReg();
+  }
+
+  if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+    if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1;
+    return ARM::ssub_0;
+  }
+  return getDPRLaneFromSPR(SReg);
+}
+
+// MI is known to be dead. Figure out what instructions
+// are also made dead by this and mark them for removal.
+void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
+  SmallVector<MachineInstr *, 8> Front;
+  DeadInstr.insert(MI);
+
+  DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
+  Front.push_back(MI);
+
+  while (Front.size() != 0) {
+    MI = Front.back();
+    Front.pop_back();
+
+    // MI is already known to be dead. We need to see
+    // if other instructions can also be removed.
+    for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if ((!MO.isReg()) || (!MO.isUse()))
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!TRI->isVirtualRegister(Reg))
+        continue;
+      MachineOperand *Op = MI->findRegisterDefOperand(Reg);
+
+      if (!Op)
+        continue;
+
+      MachineInstr *Def = Op->getParent();
+
+      // We don't need to do anything if we have already marked
+      // this instruction as being dead.
+      if (DeadInstr.find(Def) != DeadInstr.end())
+        continue;
+
+      // Check if all the uses of this instruction are marked as
+      // dead. If so, we can also mark this instruction as being
+      // dead.
+      bool IsDead = true;
+      for (unsigned int j = 0; j < Def->getNumOperands(); ++j) {
+        MachineOperand &MODef = Def->getOperand(j);
+        if ((!MODef.isReg()) || (!MODef.isDef()))
+          continue;
+        unsigned DefReg = MODef.getReg();
+        if (!TRI->isVirtualRegister(DefReg)) {
+          IsDead = false;
+          break;
+        }
+        for (MachineRegisterInfo::use_iterator II = MRI->use_begin(Reg),
+                            EE = MRI->use_end();
+                            II != EE; ++II) {
+          // We don't care about self references.
+          if (&*II == Def)
+            continue;
+          if (DeadInstr.find(&*II) == DeadInstr.end()) {
+            IsDead = false;
+            break;
+          }
+        }
+      }
+
+      if (!IsDead) continue;
+
+      DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
+      DeadInstr.insert(Def);
+    }
+  }
+}
+
+// Creates the more optimized patterns and generally does all the code
+// transformations in this pass.
+unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
+  if (MI->isCopy()) {
+    return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg());
+  }
+
+  if (MI->isInsertSubreg()) {
+    unsigned DPRReg = MI->getOperand(1).getReg();
+    unsigned SPRReg = MI->getOperand(2).getReg();
+
+    if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) {
+      MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg());
+      MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg());
+
+      if (DPRMI && SPRMI) {
+        // See if the first operand of this insert_subreg is IMPLICIT_DEF
+        MachineInstr *ECDef = elideCopies(DPRMI);
+        if (ECDef != 0 && ECDef->isImplicitDef()) {
+          // Another corner case - if we're inserting something that is purely
+          // a subreg copy of a DPR, just use that DPR.
+
+          MachineInstr *EC = elideCopies(SPRMI);
+          // Is it a subreg copy of ssub_0?
+          if (EC && EC->isCopy() &&
+              EC->getOperand(1).getSubReg() == ARM::ssub_0) {
+            DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);
+
+            // Find the thing we're subreg copying out of - is it of the same
+            // regclass as DPRMI? (i.e. a DPR or QPR).
+            unsigned FullReg = SPRMI->getOperand(1).getReg();
+            const TargetRegisterClass *TRC =
+              MRI->getRegClass(MI->getOperand(1).getReg());
+            if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
+              DEBUG(dbgs() << "Subreg copy is compatible - returning ");
+              DEBUG(dbgs() << PrintReg(FullReg) << "\n");
+              eraseInstrWithNoUses(MI);
+              return FullReg;
+            }
+          }
+
+          return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg());
+        }
+      }
+    }
+    return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
+  }
+
+  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1),
+                                          &ARM::SPRRegClass)) {
+    // See if all bar one of the operands are IMPLICIT_DEF and insert the
+    // optimizer pattern accordingly.
+    unsigned NumImplicit = 0, NumTotal = 0;
+    unsigned NonImplicitReg = ~0U;
+
+    for (unsigned I = 1; I < MI->getNumExplicitOperands(); ++I) {
+      if (!MI->getOperand(I).isReg())
+        continue;
+      ++NumTotal;
+      unsigned OpReg = MI->getOperand(I).getReg();
+
+      if (!TRI->isVirtualRegister(OpReg))
+        break;
+
+      MachineInstr *Def = MRI->getVRegDef(OpReg);
+      if (!Def)
+        break;
+      if (Def->isImplicitDef())
+        ++NumImplicit;
+      else
+        NonImplicitReg = MI->getOperand(I).getReg();
+    }
+
+    if (NumImplicit == NumTotal - 1)
+      return optimizeAllLanesPattern(MI, NonImplicitReg);
+    else
+      return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
+  }
+
+  assert(0 && "Unhandled update pattern!");
+  return 0;
+}
+
+// Return true if this MachineInstr inserts a scalar (SPR) value into
+// a D or Q register.
+bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) {
+  // The only way we can do a partial register update is through a COPY,
+  // INSERT_SUBREG or REG_SEQUENCE.
+  if (MI->isCopy() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
+    return true;
+
+  if (MI->isInsertSubreg() && usesRegClass(MI->getOperand(2),
+                                           &ARM::SPRRegClass))
+    return true;
+
+  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
+    return true;
+
+  return false;
+}
+
+// Looks through full copies to get the instruction that defines the input
+// operand for MI.
+MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) {
+  if (!MI->isFullCopy())
+    return MI;
+  if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+    return NULL;
+  MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg());
+  if (!Def)
+    return NULL;
+  return elideCopies(Def);
+}
+
+// Look through full copies and PHIs to get the set of non-copy MachineInstrs
+// that can produce MI.
+void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
+                                        SmallVectorImpl<MachineInstr*> &Outs) {
+   // Looking through PHIs may create loops so we need to track what
+   // instructions we have visited before.
+   std::set<MachineInstr *> Reached;
+   SmallVector<MachineInstr *, 8> Front;
+   Front.push_back(MI);
+   while (Front.size() != 0) {
+     MI = Front.back();
+     Front.pop_back();
+
+     // If we have already explored this MachineInstr, ignore it.
+     if (Reached.find(MI) != Reached.end())
+       continue;
+     Reached.insert(MI);
+     if (MI->isPHI()) {
+       for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+         unsigned Reg = MI->getOperand(I).getReg();
+         if (!TRI->isVirtualRegister(Reg)) {
+           continue;
+         }
+         MachineInstr *NewMI = MRI->getVRegDef(Reg);
+         if (!NewMI)
+           continue;
+         Front.push_back(NewMI);
+       }
+     } else if (MI->isFullCopy()) {
+       if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+         continue;
+       MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg());
+       if (!NewMI)
+         continue;
+       Front.push_back(NewMI);
+     } else {
+       DEBUG(dbgs() << "Found partial copy" << *MI <<"\n");
+       Outs.push_back(MI);
+     }
+   }
+}
+
+// Return the DPR virtual registers that are read by this machine instruction
+// (if any).
+SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) {
+  if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() ||
+      MI->isKill())
+    return SmallVector<unsigned, 8>();
+
+  SmallVector<unsigned, 8> Defs;
+  for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    if (!usesRegClass(MO, &ARM::DPRRegClass) &&
+        !usesRegClass(MO, &ARM::QPRRegClass))
+      continue;
+
+    Defs.push_back(MO.getReg());
+  }
+  return Defs;
+}
+
+// Creates a DPR register from an SPR one by using a VDUP.
+unsigned
+A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator InsertBefore,
+                              DebugLoc DL,
+                              unsigned Reg, unsigned Lane, bool QPR) {
+  unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
+                                                  &ARM::DPRRegClass);
+  AddDefaultPred(BuildMI(MBB,
+                         InsertBefore,
+                         DL,
+                         TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d),
+                         Out)
+                   .addReg(Reg)
+                   .addImm(Lane));
+ 
+  return Out;
+}
+
+// Creates a SPR register from a DPR by copying the value in lane 0.
+unsigned
+A15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator InsertBefore,
+                                    DebugLoc DL,
+                                    unsigned DReg, unsigned Lane,
+                                    const TargetRegisterClass *TRC) {
+  unsigned Out = MRI->createVirtualRegister(TRC);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::COPY), Out)
+    .addReg(DReg, 0, Lane);
+
+  return Out;
+}
+
+// Takes two SPR registers and creates a DPR by using a REG_SEQUENCE.
+unsigned
+A15SDOptimizer::createRegSequence(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator InsertBefore,
+                                  DebugLoc DL,
+                                  unsigned Reg1, unsigned Reg2) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::REG_SEQUENCE), Out)
+    .addReg(Reg1)
+    .addImm(ARM::dsub_0)
+    .addReg(Reg2)
+    .addImm(ARM::dsub_1);
+  return Out;
+}
+
+// Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1)
+// and merges them into one DPR register.
+unsigned
+A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator InsertBefore,
+                           DebugLoc DL,
+                           unsigned Ssub0, unsigned Ssub1) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+  AddDefaultPred(BuildMI(MBB,
+                         InsertBefore,
+                         DL,
+                         TII->get(ARM::VEXTd32), Out)
+                   .addReg(Ssub0)
+                   .addReg(Ssub1)
+                   .addImm(1));
+  return Out;
+}
+
+unsigned
+A15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator InsertBefore,
+                                   DebugLoc DL, unsigned DReg, unsigned Lane,
+                                   unsigned ToInsert) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::INSERT_SUBREG), Out)
+    .addReg(DReg)
+    .addReg(ToInsert)
+    .addImm(Lane);
+
+  return Out;
+}
+
+unsigned
+A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator InsertBefore,
+                                  DebugLoc DL) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::IMPLICIT_DEF), Out);
+  return Out;
+}
+
+// This function inserts instructions in order to optimize interactions between
+// SPR registers and DPR/QPR registers. It does so by performing VDUPs on all
+// lanes, and the using VEXT instructions to recompose the result.
+unsigned
+A15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) {
+  MachineBasicBlock::iterator InsertPt(MI);
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock &MBB = *MI->getParent();
+  InsertPt++;
+  unsigned Out;
+
+  if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass)) {
+    unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg,
+                                         ARM::dsub_0, &ARM::DPRRegClass);
+    unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg,
+                                         ARM::dsub_1, &ARM::DPRRegClass);
+
+    unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0);
+    unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1);
+    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
+
+    unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0);
+    unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1);
+    Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4);
+
+    Out = createRegSequence(MBB, InsertPt, DL, Out, Out2);
+
+  } else if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)) {
+    unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0);
+    unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1);
+    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
+
+  } else {
+    assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) &&
+           "Found unexpected regclass!");
+
+    unsigned PrefLane = getPrefSPRLane(Reg);
+    unsigned Lane;
+    switch (PrefLane) {
+      case ARM::ssub_0: Lane = 0; break;
+      case ARM::ssub_1: Lane = 1; break;
+      default: llvm_unreachable("Unknown preferred lane!");
+    }
+
+    bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass);
+
+    Out = createImplicitDef(MBB, InsertPt, DL);
+    Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg);
+    Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR);
+    eraseInstrWithNoUses(MI);
+  }
+  return Out;
+}
+
+bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
+  // We look for instructions that write S registers that are then read as
+  // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and
+  // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or
+  // merge two SPR values to form a DPR register.  In order avoid false
+  // positives we make sure that there is an SPR producer so we look past
+  // COPY and PHI nodes to find it.
+  //
+  // The best code pattern for when an SPR producer is going to be used by a
+  // DPR or QPR consumer depends on whether the other lanes of the
+  // corresponding DPR/QPR are currently defined.
+  //
+  // We can handle these efficiently, depending on the type of
+  // pseudo-instruction that is producing the pattern
+  //
+  //   * COPY:          * VDUP all lanes and merge the results together
+  //                      using VEXTs.
+  //
+  //   * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR
+  //                      lane, and the other lane(s) of the DPR/QPR register
+  //                      that we are inserting in are undefined, use the
+  //                      original DPR/QPR value. 
+  //                    * Otherwise, fall back on the same stategy as COPY.
+  //
+  //   * REG_SEQUENCE:  * If all except one of the input operands are
+  //                      IMPLICIT_DEFs, insert the VDUP pattern for just the
+  //                      defined input operand
+  //                    * Otherwise, fall back on the same stategy as COPY.
+  //
+
+  // First, get all the reads of D-registers done by this instruction.
+  SmallVector<unsigned, 8> Defs = getReadDPRs(MI);
+  bool Modified = false;
+
+  for (SmallVector<unsigned, 8>::iterator I = Defs.begin(), E = Defs.end();
+     I != E; ++I) {
+    // Follow the def-use chain for this DPR through COPYs, and also through
+    // PHIs (which are essentially multi-way COPYs). It is because of PHIs that
+    // we can end up with multiple defs of this DPR.
+
+    SmallVector<MachineInstr *, 8> DefSrcs;
+    if (!TRI->isVirtualRegister(*I))
+      continue;
+    MachineInstr *Def = MRI->getVRegDef(*I);
+    if (!Def)
+      continue;
+
+    elideCopiesAndPHIs(Def, DefSrcs);
+
+    for (SmallVector<MachineInstr*, 8>::iterator II = DefSrcs.begin(),
+      EE = DefSrcs.end(); II != EE; ++II) {
+      MachineInstr *MI = *II;
+
+      // If we've already analyzed and replaced this operand, don't do
+      // anything.
+      if (Replacements.find(MI) != Replacements.end())
+        continue;
+
+      // Now, work out if the instruction causes a SPR->DPR dependency.
+      if (!hasPartialWrite(MI))
+        continue;
+
+      // Collect all the uses of this MI's DPR def for updating later.
+      SmallVector<MachineOperand*, 8> Uses;
+      unsigned DPRDefReg = MI->getOperand(0).getReg();
+      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
+             E = MRI->use_end(); I != E; ++I)
+        Uses.push_back(&I.getOperand());
+
+      // We can optimize this.
+      unsigned NewReg = optimizeSDPattern(MI);
+
+      if (NewReg != 0) {
+        Modified = true;
+        for (SmallVector<MachineOperand*, 8>::const_iterator I = Uses.begin(),
+               E = Uses.end(); I != E; ++I) {
+          DEBUG(dbgs() << "Replacing operand "
+                       << **I << " with "
+                       << PrintReg(NewReg) << "\n");
+          (*I)->substVirtReg(NewReg, 0, *TRI);
+        }
+      }
+      Replacements[MI] = NewReg;
+    }
+  }
+  return Modified;
+}
+
+bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+  TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
+  TRI = Fn.getTarget().getRegisterInfo();
+  MRI = &Fn.getRegInfo();
+  bool Modified = false;
+
+  DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n");
+
+  DeadInstr.clear();
+  Replacements.clear();
+
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+
+    for (MachineBasicBlock::iterator MI = MFI->begin(), ME = MFI->end();
+      MI != ME;) {
+      Modified |= runOnInstruction(MI++);
+    }
+ 
+  }
+
+  for (std::set<MachineInstr *>::iterator I = DeadInstr.begin(),
+                                            E = DeadInstr.end();
+                                            I != E; ++I) {
+    (*I)->eraseFromParent();
+  }
+
+  return Modified;
+}
+
+FunctionPass *llvm::createA15SDOptimizerPass() {
+  return new A15SDOptimizer();
+}
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 5faf8c3..80e5f37 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -35,6 +35,7 @@ FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
 FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
                                           JITCodeEmitter &JCE);
 
+FunctionPass *createA15SDOptimizerPass();
 FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
 FunctionPass *createARMExpandPseudoPass();
 FunctionPass *createARMGlobalBaseRegPass();
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 58c7798..13ec208 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1357,7 +1357,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVr)
       .addReg(ARM::PC)
-      .addImm(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(0).getReg())
       // Add predicate operands.
       .addImm(ARMCC::AL)
       .addReg(0)
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index ed001ea..ed8b9cd 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1125,7 +1125,7 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const{
   // copyPhysReg() calls.  Look for VMOVS instructions that can legally be
   // widened to VMOVD.  We prefer the VMOVD when possible because it may be
   // changed into a VORR that can go down the NEON pipeline.
-  if (!WidenVMOVS || !MI->isCopy())
+  if (!WidenVMOVS || !MI->isCopy() || Subtarget.isCortexA15())
     return false;
 
   // Look for a copy between even S-registers.  That is where we keep floats
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 0ca6450..3b12408 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -1038,58 +1038,6 @@ static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
   return FnSize;
 }
 
-/// estimateStackSize - Estimate and return the size of the frame.
-/// FIXME: Make generic?
-static unsigned estimateStackSize(MachineFunction &MF) {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
-  unsigned MaxAlign = MFI->getMaxAlignment();
-  int Offset = 0;
-
-  // This code is very, very similar to PEI::calculateFrameObjectOffsets().
-  // It really should be refactored to share code. Until then, changes
-  // should keep in mind that there's tight coupling between the two.
-
-  for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
-    int FixedOff = -MFI->getObjectOffset(i);
-    if (FixedOff > Offset) Offset = FixedOff;
-  }
-  for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
-    if (MFI->isDeadObjectIndex(i))
-      continue;
-    Offset += MFI->getObjectSize(i);
-    unsigned Align = MFI->getObjectAlignment(i);
-    // Adjust to alignment boundary
-    Offset = (Offset+Align-1)/Align*Align;
-
-    MaxAlign = std::max(Align, MaxAlign);
-  }
-
-  if (MFI->adjustsStack() && TFI->hasReservedCallFrame(MF))
-    Offset += MFI->getMaxCallFrameSize();
-
-  // Round up the size to a multiple of the alignment.  If the function has
-  // any calls or alloca's, align to the target's StackAlignment value to
-  // ensure that the callee's frame or the alloca data is suitably aligned;
-  // otherwise, for leaf functions, align to the TransientStackAlignment
-  // value.
-  unsigned StackAlign;
-  if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
-      (RegInfo->needsStackRealignment(MF) && MFI->getObjectIndexEnd() != 0))
-    StackAlign = TFI->getStackAlignment();
-  else
-    StackAlign = TFI->getTransientStackAlignment();
-
-  // If the frame pointer is eliminated, all frame offsets will be relative to
-  // SP not FP. Align to MaxAlign so this works.
-  StackAlign = std::max(StackAlign, MaxAlign);
-  unsigned AlignMask = StackAlign - 1;
-  Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
-
-  return (unsigned)Offset;
-}
-
 /// estimateRSStackSizeLimit - Look at each instruction that references stack
 /// frames and return the stack size limit beyond which some of these
 /// instructions will require a scratch register during their expansion later.
@@ -1235,7 +1183,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
     // we've used all the registers and so R4 is already used, so not marking
     // it here will be OK.
     // FIXME: It will be better just to find spare register here.
-    unsigned StackSize = estimateStackSize(MF);
+    unsigned StackSize = MFI->estimateStackSize(MF);
     if (MFI->hasVarSizedObjects() || StackSize > 508)
       MRI.setPhysRegUsed(ARM::R4);
   }
@@ -1330,7 +1278,8 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
   //        worth the effort and added fragility?
   bool BigStack =
     (RS &&
-     (estimateStackSize(MF) + ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >=
+     (MFI->estimateStackSize(MF) +
+      ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >=
       estimateRSStackSizeLimit(MF, this)))
     || MFI->hasVarSizedObjects()
     || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF));
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index a83f052..2c51de2 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -3155,7 +3155,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
 
       // Remap uses.
-      SDValue Glue = isThumb ? SDValue(Ld, 2) : SDValue(Ld, 1);
+      SDValue OutChain = isThumb ? SDValue(Ld, 2) : SDValue(Ld, 1);
       if (!SDValue(N, 0).use_empty()) {
         SDValue Result;
         if (isThumb)
@@ -3163,9 +3163,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
         else {
           SDValue SubRegIdx = CurDAG->getTargetConstant(ARM::gsub_0, MVT::i32);
           SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-              dl, MVT::i32, MVT::Glue, SDValue(Ld, 0), SubRegIdx, Glue);
+              dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
           Result = SDValue(ResNode,0);
-          Glue = Result.getValue(1);
         }
         ReplaceUses(SDValue(N, 0), Result);
       }
@@ -3176,13 +3175,12 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
         else {
           SDValue SubRegIdx = CurDAG->getTargetConstant(ARM::gsub_1, MVT::i32);
           SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-              dl, MVT::i32, MVT::Glue, SDValue(Ld, 0), SubRegIdx, Glue);
+              dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
           Result = SDValue(ResNode,0);
-          Glue = Result.getValue(1);
         }
         ReplaceUses(SDValue(N, 1), Result);
       }
-      ReplaceUses(SDValue(N, 2), Glue);
+      ReplaceUses(SDValue(N, 2), OutChain);
       return NULL;
     }
 
@@ -3195,9 +3193,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
 
       // Store exclusive double return a i32 value which is the return status
       // of the issued store.
-      std::vector<EVT> ResTys;
-      ResTys.push_back(MVT::i32);
-      ResTys.push_back(MVT::Other);
+      EVT ResTys[] = { MVT::i32, MVT::Other };
 
       bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
       // Place arguments in the right order.
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index ef96e56..514971f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -504,6 +504,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
 
     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
@@ -521,6 +522,23 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
 
+    // Mark v2f32 intrinsics.
+    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
+    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
+
     // Neon does not support some operations on v1i64 and v2i64 types.
     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
     // Custom handling for some quad-vector types to detect VMULL.
@@ -554,6 +572,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
 
+    // NEON only has FMA instructions as of VFP4.
+    if (!Subtarget->hasVFP4()) {
+      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
+      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
+    }
+
     setTargetDAGCombine(ISD::INTRINSIC_VOID);
     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
@@ -1581,7 +1605,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       // On ELF targets for PIC code, direct calls should go through the PLT
       unsigned OpFlags = 0;
       if (Subtarget->isTargetELF() &&
-                  getTargetMachine().getRelocationModel() == Reloc::PIC_)
+          getTargetMachine().getRelocationModel() == Reloc::PIC_)
         OpFlags = ARMII::MO_PLT;
       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
     }
@@ -2247,8 +2271,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
   EVT PtrVT = getPointerTy();
   DebugLoc dl = Op.getDebugLoc();
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
-  if (RelocM == Reloc::PIC_) {
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
     bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
     ARMConstantPoolValue *CPV =
       ARMConstantPoolConstant::Create(GV,
@@ -2292,8 +2315,6 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
   DebugLoc dl = Op.getDebugLoc();
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
-  MachineFunction &MF = DAG.getMachineFunction();
-  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
   // FIXME: Enable this for static codegen when tool issues are fixed.  Also
   // update ARMFastISel::ARMMaterializeGV.
@@ -2321,6 +2342,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
   if (RelocM == Reloc::Static) {
     CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
   } else {
+    ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
     ARMPCLabelIndex = AFI->createPICLabelUId();
     unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
     ARMConstantPoolValue *CPV =
@@ -2401,7 +2423,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
     EVT PtrVT = getPointerTy();
-    DebugLoc dl = Op.getDebugLoc();
     Reloc::Model RelocM = getTargetMachine().getRelocationModel();
     SDValue CPAddr;
     unsigned PCAdj = (RelocM != Reloc::PIC_)
@@ -2661,7 +2682,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
   CCInfo.AnalyzeFormalArguments(Ins,
                                 CCAssignFnForNode(CallConv, /* Return*/ false,
                                                   isVarArg));
-  
+
   SmallVector<SDValue, 16> ArgValues;
   int lastInsIndex = -1;
   SDValue ArgValue;
@@ -2776,7 +2797,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
             } else {
               int FI = MFI->CreateFixedObject(Flags.getByValSize(),
                                               VA.getLocMemOffset(), false);
-              InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));              
+              InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
             }
           } else {
             int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
@@ -3573,7 +3594,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
 /// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
-/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] 
+/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
 ///            [b0 b1 b2 b3 b4 b5 b6 b7]
 ///           +[b1 b0 b3 b2 b5 b4 b7 b6]
 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
@@ -3594,7 +3615,7 @@ static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
 /// bit-count for each 16-bit element from the operand.  We need slightly
 /// different sequencing for v4i16 and v8i16 to stay within NEON's available
 /// 64/128-bit registers.
-/// 
+///
 /// Trace for v4i16:
 /// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
@@ -3625,7 +3646,7 @@ static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
 /// input    = [v0    v1    ] (vi: 32-bit elements)
 /// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
-/// vrev: N0 = [k1 k0 k3 k2 ] 
+/// vrev: N0 = [k1 k0 k3 k2 ]
 ///            [k0 k1 k2 k3 ]
 ///       N1 =+[k1 k0 k3 k2 ]
 ///            [k0 k2 k1 k3 ]
@@ -4403,7 +4424,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
 
     ValueCounts.insert(std::make_pair(V, 0));
     unsigned &Count = ValueCounts[V];
-    
+
     // Is this value dominant? (takes up more than half of the lanes)
     if (++Count > (NumElts / 2)) {
       hasDominantValue = true;
@@ -4431,8 +4452,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
 
       // If we are VDUPing a value that comes directly from a vector, that will
       // cause an unnecessary move to and from a GPR, where instead we could
-      // just use VDUPLANE.
-      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+      // just use VDUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the VDUP from lane instructions only have
+      // constant-index forms.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          isa<ConstantSDNode>(Value->getOperand(1))) {
         // We need to create a new undef vector to use for the VDUPLANE if the
         // size of the vector from which we get the value is different than the
         // size of the vector that we need to create. We will insert the element
@@ -4447,12 +4471,10 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
                         Value, DAG.getConstant(index, MVT::i32)),
                            DAG.getConstant(index, MVT::i32));
-        } else {
+        } else
           N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                         Value->getOperand(0), Value->getOperand(1));
-        }
-      }
-      else
+      } else
         N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
 
       if (!usesOnlyOneValue) {
@@ -4484,7 +4506,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     if (usesOnlyOneValue) {
       SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
       if (isConstant && Val.getNode())
-        return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 
+        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
     }
   }
 
@@ -6329,6 +6351,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
     MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
   unsigned UId = AFI->createJumpTableUId();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
 
   // Create the MBBs for the dispatch code.
 
@@ -6338,14 +6361,11 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
 
   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
   unsigned trap_opcode;
-  if (Subtarget->isThumb()) {
+  if (Subtarget->isThumb())
     trap_opcode = ARM::tTRAP;
-  } else {
-    if (Subtarget->useNaClTrap())
-      trap_opcode = ARM::TRAPNaCl;
-    else
-      trap_opcode = ARM::TRAP;
-  }
+  else
+    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
+
   BuildMI(TrapBB, dl, TII->get(trap_opcode));
   DispatchBB->addSuccessor(TrapBB);
 
@@ -6492,11 +6512,14 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
                    .addImm(0)
                    .addMemOperand(JTMMOLd));
 
-    unsigned NewVReg6 = MRI->createVirtualRegister(TRC);
-    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
-                   .addReg(ARM::CPSR, RegState::Define)
-                   .addReg(NewVReg5, RegState::Kill)
-                   .addReg(NewVReg3));
+    unsigned NewVReg6 = NewVReg5;
+    if (RelocM == Reloc::PIC_) {
+      NewVReg6 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
+                     .addReg(ARM::CPSR, RegState::Define)
+                     .addReg(NewVReg5, RegState::Kill)
+                     .addReg(NewVReg3));
+    }
 
     BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
       .addReg(NewVReg6, RegState::Kill)
@@ -6576,11 +6599,18 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
       .addImm(0)
       .addMemOperand(JTMMOLd));
 
-    BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
-      .addReg(NewVReg5, RegState::Kill)
-      .addReg(NewVReg4)
-      .addJumpTableIndex(MJTI)
-      .addImm(UId);
+    if (RelocM == Reloc::PIC_) {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
+        .addReg(NewVReg5, RegState::Kill)
+        .addReg(NewVReg4)
+        .addJumpTableIndex(MJTI)
+        .addImm(UId);
+    } else {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
+        .addReg(NewVReg5, RegState::Kill)
+        .addJumpTableIndex(MJTI)
+        .addImm(UId);
+    }
   }
 
   // Add the jump table entries as successors to the MBB.
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 7745218..3003760 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -28,6 +28,11 @@ EnableGlobalMerge("global-merge", cl::Hidden,
                   cl::desc("Enable global merge pass"),
                   cl::init(true));
 
+static cl::opt<bool>
+DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
+                   cl::desc("Inhibit optimization of S->D register accesses on A15"),
+                   cl::init(false));
+
 extern "C" void LLVMInitializeARMTarget() {
   // Register the target.
   RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget);
@@ -164,6 +169,12 @@ bool ARMPassConfig::addPreRegAlloc() {
     addPass(createARMLoadStoreOptimizationPass(true));
   if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isLikeA9())
     addPass(createMLxExpansionPass());
+  // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
+  // enabled when NEON is available.
+  if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA15() &&
+    getARMSubtarget().hasNEON() && !DisableA15SDOptimization) {
+    addPass(createA15SDOptimizerPass());
+  }
   return true;
 }
 
@@ -174,7 +185,8 @@ bool ARMPassConfig::addPreSched2() {
       addPass(createARMLoadStoreOptimizationPass());
       printAndVerify("After ARM load / store optimizer");
     }
-    if (getARMSubtarget().hasNEON())
+    if ((DisableA15SDOptimization || !getARMSubtarget().isCortexA15()) &&
+      getARMSubtarget().hasNEON())
       addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
   }
 
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 01c04b4..140a8db 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -177,6 +177,23 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  // Single to/from double precision conversions.
+  static const CostTblEntry<MVT> NEONFltDblTbl[] = {
+    // Vector fptrunc/fpext conversions.
+    { ISD::FP_ROUND,   MVT::v2f64, 2 },
+    { ISD::FP_EXTEND,  MVT::v2f32, 2 },
+    { ISD::FP_EXTEND,  MVT::v4f32, 4 }
+  };
+
+  if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
+                                          ISD == ISD::FP_EXTEND)) {
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+    int Idx = CostTableLookup<MVT>(NEONFltDblTbl, array_lengthof(NEONFltDblTbl),
+                                ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * NEONFltDblTbl[Idx].Cost;
+  }
+
   EVT SrcTy = TLI->getValueType(Src);
   EVT DstTy = TLI->getValueType(Dst);
 
@@ -194,17 +211,63 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
     { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },
 
+    // Operations that we legalize using load/stores to the stack.
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 16*2 + 4*4 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 16*2 + 4*3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 8*2 + 2*4 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 8*2 + 2*3 },
+    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 },
+    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 },
+
     // Vector float <-> i32 conversions.
     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
+
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
+
     { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
     { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
+    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
+    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },
 
     // Vector double <-> i32 conversions.
     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+
     { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
-    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 }
+    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
+    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
+    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
+    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
   };
 
   if (SrcTy.isVector() && ST->hasNEON()) {
@@ -247,7 +310,6 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
         return NEONFloatConversionTbl[Idx].Cost;
   }
 
-
   // Scalar integer to float conversions.
   static const TypeConversionCostTblEntry<MVT> NEONIntegerConversionTbl[] = {
     { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
@@ -303,7 +365,6 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
       return ARMIntegerConversionTbl[Idx].Cost;
   }
 
-
   return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
 }
 
@@ -326,6 +387,25 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   // On NEON a a vector select gets lowered to vbsl.
   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
+    // Lowering of some vector selects is currently far from perfect.
+    static const TypeConversionCostTblEntry<MVT> NEONVectorSelectTbl[] = {
+      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 },
+      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
+    };
+
+    EVT SelCondTy = TLI->getValueType(CondTy);
+    EVT SelValTy = TLI->getValueType(ValTy);
+    int Idx = ConvertCostTableLookup<MVT>(NEONVectorSelectTbl,
+                                          array_lengthof(NEONVectorSelectTbl),
+                                          ISD, SelCondTy.getSimpleVT(),
+                                          SelValTy.getSimpleVT());
+    if (Idx != -1)
+      return NEONVectorSelectTbl[Idx].Cost;
+
     std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
     return LT.first;
   }
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 6c678fd..c897efd 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -316,103 +316,127 @@ class ARMOperand : public MCParsedAsmOperand {
   SMLoc StartLoc, EndLoc;
   SmallVector<unsigned, 8> Registers;
 
+  struct CCOp {
+    ARMCC::CondCodes Val;
+  };
+
+  struct CopOp {
+    unsigned Val;
+  };
+
+  struct CoprocOptionOp {
+    unsigned Val;
+  };
+
+  struct ITMaskOp {
+    unsigned Mask:4;
+  };
+
+  struct MBOptOp {
+    ARM_MB::MemBOpt Val;
+  };
+
+  struct IFlagsOp {
+    ARM_PROC::IFlags Val;
+  };
+
+  struct MMaskOp {
+    unsigned Val;
+  };
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+  };
+
+  // A vector register list is a sequential list of 1 to 4 registers.
+  struct VectorListOp {
+    unsigned RegNum;
+    unsigned Count;
+    unsigned LaneIndex;
+    bool isDoubleSpaced;
+  };
+
+  struct VectorIndexOp {
+    unsigned Val;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  /// Combined record for all forms of ARM address expressions.
+  struct MemoryOp {
+    unsigned BaseRegNum;
+    // Offset is in OffsetReg or OffsetImm. If both are zero, no offset
+    // was specified.
+    const MCConstantExpr *OffsetImm;  // Offset immediate value
+    unsigned OffsetRegNum;    // Offset register num, when OffsetImm == NULL
+    ARM_AM::ShiftOpc ShiftType; // Shift type for OffsetReg
+    unsigned ShiftImm;        // shift for OffsetReg.
+    unsigned Alignment;       // 0 = no alignment specified
+    // n = alignment in bytes (2, 4, 8, 16, or 32)
+    unsigned isNegative : 1;  // Negated OffsetReg? (~'U' bit)
+  };
+
+  struct PostIdxRegOp {
+    unsigned RegNum;
+    bool isAdd;
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned ShiftImm;
+  };
+
+  struct ShifterImmOp {
+    bool isASR;
+    unsigned Imm;
+  };
+
+  struct RegShiftedRegOp {
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned SrcReg;
+    unsigned ShiftReg;
+    unsigned ShiftImm;
+  };
+
+  struct RegShiftedImmOp {
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned SrcReg;
+    unsigned ShiftImm;
+  };
+
+  struct RotImmOp {
+    unsigned Imm;
+  };
+
+  struct BitfieldOp {
+    unsigned LSB;
+    unsigned Width;
+  };
+
   union {
-    struct {
-      ARMCC::CondCodes Val;
-    } CC;
-
-    struct {
-      unsigned Val;
-    } Cop;
-
-    struct {
-      unsigned Val;
-    } CoprocOption;
-
-    struct {
-      unsigned Mask:4;
-    } ITMask;
-
-    struct {
-      ARM_MB::MemBOpt Val;
-    } MBOpt;
-
-    struct {
-      ARM_PROC::IFlags Val;
-    } IFlags;
-
-    struct {
-      unsigned Val;
-    } MMask;
-
-    struct {
-      const char *Data;
-      unsigned Length;
-    } Tok;
-
-    struct {
-      unsigned RegNum;
-    } Reg;
-
-    // A vector register list is a sequential list of 1 to 4 registers.
-    struct {
-      unsigned RegNum;
-      unsigned Count;
-      unsigned LaneIndex;
-      bool isDoubleSpaced;
-    } VectorList;
-
-    struct {
-      unsigned Val;
-    } VectorIndex;
-
-    struct {
-      const MCExpr *Val;
-    } Imm;
-
-    /// Combined record for all forms of ARM address expressions.
-    struct {
-      unsigned BaseRegNum;
-      // Offset is in OffsetReg or OffsetImm. If both are zero, no offset
-      // was specified.
-      const MCConstantExpr *OffsetImm;  // Offset immediate value
-      unsigned OffsetRegNum;    // Offset register num, when OffsetImm == NULL
-      ARM_AM::ShiftOpc ShiftType; // Shift type for OffsetReg
-      unsigned ShiftImm;        // shift for OffsetReg.
-      unsigned Alignment;       // 0 = no alignment specified
-                                // n = alignment in bytes (2, 4, 8, 16, or 32)
-      unsigned isNegative : 1;  // Negated OffsetReg? (~'U' bit)
-    } Memory;
-
-    struct {
-      unsigned RegNum;
-      bool isAdd;
-      ARM_AM::ShiftOpc ShiftTy;
-      unsigned ShiftImm;
-    } PostIdxReg;
-
-    struct {
-      bool isASR;
-      unsigned Imm;
-    } ShifterImm;
-    struct {
-      ARM_AM::ShiftOpc ShiftTy;
-      unsigned SrcReg;
-      unsigned ShiftReg;
-      unsigned ShiftImm;
-    } RegShiftedReg;
-    struct {
-      ARM_AM::ShiftOpc ShiftTy;
-      unsigned SrcReg;
-      unsigned ShiftImm;
-    } RegShiftedImm;
-    struct {
-      unsigned Imm;
-    } RotImm;
-    struct {
-      unsigned LSB;
-      unsigned Width;
-    } Bitfield;
+    struct CCOp CC;
+    struct CopOp Cop;
+    struct CoprocOptionOp CoprocOption;
+    struct MBOptOp MBOpt;
+    struct ITMaskOp ITMask;
+    struct IFlagsOp IFlags;
+    struct MMaskOp MMask;
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct VectorListOp VectorList;
+    struct VectorIndexOp VectorIndex;
+    struct ImmOp Imm;
+    struct MemoryOp Memory;
+    struct PostIdxRegOp PostIdxReg;
+    struct ShifterImmOp ShifterImm;
+    struct RegShiftedRegOp RegShiftedReg;
+    struct RegShiftedImmOp RegShiftedImm;
+    struct RotImmOp RotImm;
+    struct BitfieldOp Bitfield;
   };
 
   ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 586834c..b832508 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -15,6 +15,7 @@ tablegen(LLVM ARMGenDisassemblerTables.inc -gen-disassembler)
 add_public_tablegen_target(ARMCommonTableGen)
 
 add_llvm_target(ARMCodeGen
+  A15SDOptimizer.cpp
   ARMAsmPrinter.cpp
   ARMBaseInstrInfo.cpp
   ARMBaseRegisterInfo.cpp
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 09e15af..7a59a7d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -655,15 +655,28 @@ getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
   int32_t offset = MO.getImm();
   uint32_t Val = 0x2000;
 
+  int SoImmVal;
   if (offset == INT32_MIN) {
     Val = 0x1000;
-    offset = 0;
+    SoImmVal = 0;
   } else if (offset < 0) {
     Val = 0x1000;
     offset *= -1;
+    SoImmVal = ARM_AM::getSOImmVal(offset);
+    if(SoImmVal == -1) {
+      Val = 0x2000;
+      offset *= -1;
+      SoImmVal = ARM_AM::getSOImmVal(offset);
+    }
+  } else {
+    SoImmVal = ARM_AM::getSOImmVal(offset);
+    if(SoImmVal == -1) {
+      Val = 0x1000;
+      offset *= -1;
+      SoImmVal = ARM_AM::getSOImmVal(offset);
+    }
   }
 
-  int SoImmVal = ARM_AM::getSOImmVal(offset);
   assert(SoImmVal != -1 && "Not a valid so_imm value!");
 
   Val |= SoImmVal;
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index fac931a..0a8b1af 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -103,6 +103,16 @@ CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
     State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
     return false;
   }
+  if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    ValVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
   if (LocVT == MVT::i32 || LocVT == MVT::f32) {
     ofst = State.AllocateStack(4, 4);
     State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
@@ -1024,6 +1034,14 @@ SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op,
   return DAG.getNode(HexagonISD::CONST32, dl, getPointerTy(), Result);
 }
 
+SDValue
+HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDValue BA_SD =  DAG.getTargetBlockAddress(BA, MVT::i32);
+  DebugLoc dl = Op.getDebugLoc();
+  return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), BA_SD);
+}
+
 //===----------------------------------------------------------------------===//
 // TargetLowering Implementation
 //===----------------------------------------------------------------------===//
@@ -1297,6 +1315,7 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
     // Custom legalize GlobalAddress nodes into CONST32.
     setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
     setOperationAction(ISD::GlobalAddress, MVT::i8, Custom);
+    setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
     // Truncate action?
     setOperationAction(ISD::TRUNCATE, MVT::i64, Expand);
 
@@ -1342,7 +1361,6 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
 
     }
 
-    setOperationAction(ISD::BR_CC, MVT::Other, Expand);
     setOperationAction(ISD::BRIND, MVT::Other, Expand);
     if (EmitJumpTables) {
       setOperationAction(ISD::BR_JT, MVT::Other, Custom);
@@ -1352,6 +1370,9 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
     // Increase jump tables cutover to 5, was 4.
     setMinimumJumpTableEntries(5);
 
+    setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+    setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+    setOperationAction(ISD::BR_CC, MVT::i1,  Expand);
     setOperationAction(ISD::BR_CC, MVT::i32, Expand);
 
     setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
@@ -1365,6 +1386,29 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
     setOperationAction(ISD::FREM , MVT::f32, Expand);
     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+    // In V4, we have double word add/sub with carry. The problem with
+    // modelling this instruction is that it produces 2 results - Rdd and Px.
+    // To model update of Px, we will have to use Defs[p0..p3] which will
+    // cause any predicate live range to spill. So, we pretend we dont't
+    // have these instructions.
+    setOperationAction(ISD::ADDE, MVT::i8, Expand);
+    setOperationAction(ISD::ADDE, MVT::i16, Expand);
+    setOperationAction(ISD::ADDE, MVT::i32, Expand);
+    setOperationAction(ISD::ADDE, MVT::i64, Expand);
+    setOperationAction(ISD::SUBE, MVT::i8, Expand);
+    setOperationAction(ISD::SUBE, MVT::i16, Expand);
+    setOperationAction(ISD::SUBE, MVT::i32, Expand);
+    setOperationAction(ISD::SUBE, MVT::i64, Expand);
+    setOperationAction(ISD::ADDC, MVT::i8, Expand);
+    setOperationAction(ISD::ADDC, MVT::i16, Expand);
+    setOperationAction(ISD::ADDC, MVT::i32, Expand);
+    setOperationAction(ISD::ADDC, MVT::i64, Expand);
+    setOperationAction(ISD::SUBC, MVT::i8, Expand);
+    setOperationAction(ISD::SUBC, MVT::i16, Expand);
+    setOperationAction(ISD::SUBC, MVT::i32, Expand);
+    setOperationAction(ISD::SUBC, MVT::i64, Expand);
+
     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
     setOperationAction(ISD::CTTZ , MVT::i32, Expand);
@@ -1436,6 +1480,8 @@ HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
     default: return 0;
     case HexagonISD::CONST32:     return "HexagonISD::CONST32";
+    case HexagonISD::CONST32_GP: return "HexagonISD::CONST32_GP";
+    case HexagonISD::CONST32_Int_Real: return "HexagonISD::CONST32_Int_Real";
     case HexagonISD::ADJDYNALLOC: return "HexagonISD::ADJDYNALLOC";
     case HexagonISD::CMPICC:      return "HexagonISD::CMPICC";
     case HexagonISD::CMPFCC:      return "HexagonISD::CMPFCC";
@@ -1484,6 +1530,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op, DAG);
     case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, DAG);
     case ISD::GlobalAddress:      return LowerGLOBALADDRESS(Op, DAG);
+    case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
     case ISD::VASTART:            return LowerVASTART(Op, DAG);
     case ISD::BR_JT:              return LowerBR_JT(Op, DAG);
 
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 65dab85..3279cc6 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -27,6 +27,7 @@ namespace llvm {
 
       CONST32,
       CONST32_GP,  // For marking data present in GP.
+      CONST32_Int_Real,
       FCONST32,
       SETCC,
       ADJDYNALLOC,
@@ -106,6 +107,7 @@ namespace llvm {
                                  DebugLoc dl, SelectionDAG &DAG,
                                  SmallVectorImpl<SDValue> &InVals) const;
     SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
                       SmallVectorImpl<SDValue> &InVals) const;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index d30cdda..96a252e 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -557,218 +557,43 @@ unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
 }
 
 bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const {
-  switch(MI->getOpcode()) {
-    default: return false;
-    // JMP_EQri
-    case Hexagon::JMP_EQriPt_nv_V4:
-    case Hexagon::JMP_EQriPnt_nv_V4:
-    case Hexagon::JMP_EQriNotPt_nv_V4:
-    case Hexagon::JMP_EQriNotPnt_nv_V4:
-
-    // JMP_EQri - with -1
-    case Hexagon::JMP_EQriPtneg_nv_V4:
-    case Hexagon::JMP_EQriPntneg_nv_V4:
-    case Hexagon::JMP_EQriNotPtneg_nv_V4:
-    case Hexagon::JMP_EQriNotPntneg_nv_V4:
-
-    // JMP_EQrr
-    case Hexagon::JMP_EQrrPt_nv_V4:
-    case Hexagon::JMP_EQrrPnt_nv_V4:
-    case Hexagon::JMP_EQrrNotPt_nv_V4:
-    case Hexagon::JMP_EQrrNotPnt_nv_V4:
-
-    // JMP_GTri
-    case Hexagon::JMP_GTriPt_nv_V4:
-    case Hexagon::JMP_GTriPnt_nv_V4:
-    case Hexagon::JMP_GTriNotPt_nv_V4:
-    case Hexagon::JMP_GTriNotPnt_nv_V4:
-
-    // JMP_GTri - with -1
-    case Hexagon::JMP_GTriPtneg_nv_V4:
-    case Hexagon::JMP_GTriPntneg_nv_V4:
-    case Hexagon::JMP_GTriNotPtneg_nv_V4:
-    case Hexagon::JMP_GTriNotPntneg_nv_V4:
-
-    // JMP_GTrr
-    case Hexagon::JMP_GTrrPt_nv_V4:
-    case Hexagon::JMP_GTrrPnt_nv_V4:
-    case Hexagon::JMP_GTrrNotPt_nv_V4:
-    case Hexagon::JMP_GTrrNotPnt_nv_V4:
-
-    // JMP_GTrrdn
-    case Hexagon::JMP_GTrrdnPt_nv_V4:
-    case Hexagon::JMP_GTrrdnPnt_nv_V4:
-    case Hexagon::JMP_GTrrdnNotPt_nv_V4:
-    case Hexagon::JMP_GTrrdnNotPnt_nv_V4:
-
-    // JMP_GTUri
-    case Hexagon::JMP_GTUriPt_nv_V4:
-    case Hexagon::JMP_GTUriPnt_nv_V4:
-    case Hexagon::JMP_GTUriNotPt_nv_V4:
-    case Hexagon::JMP_GTUriNotPnt_nv_V4:
-
-    // JMP_GTUrr
-    case Hexagon::JMP_GTUrrPt_nv_V4:
-    case Hexagon::JMP_GTUrrPnt_nv_V4:
-    case Hexagon::JMP_GTUrrNotPt_nv_V4:
-    case Hexagon::JMP_GTUrrNotPnt_nv_V4:
+  // Constant extenders are allowed only for V4 and above.
+  if (!Subtarget.hasV4TOps())
+    return false;
 
-    // JMP_GTUrrdn
-    case Hexagon::JMP_GTUrrdnPt_nv_V4:
-    case Hexagon::JMP_GTUrrdnPnt_nv_V4:
-    case Hexagon::JMP_GTUrrdnNotPt_nv_V4:
-    case Hexagon::JMP_GTUrrdnNotPnt_nv_V4:
+  const MCInstrDesc &MID = MI->getDesc();
+  const uint64_t F = MID.TSFlags;
+  if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask)
+    return true;
 
-    // TFR_FI
+  // TODO: This is largely obsolete now. Will need to be removed
+  // in consecutive patches.
+  switch(MI->getOpcode()) {
+    // TFR_FI Remains a special case.
     case Hexagon::TFR_FI:
       return true;
+    default:
+      return false;
   }
+  return  false;
 }
 
+// This returns true in two cases:
+// - The OP code itself indicates that this is an extended instruction.
+// - One of MOs has been marked with HMOTF_ConstExtended flag.
 bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const {
-  switch(MI->getOpcode()) {
-    default: return false;
-    // JMP_EQri
-    case Hexagon::JMP_EQriPt_ie_nv_V4:
-    case Hexagon::JMP_EQriPnt_ie_nv_V4:
-    case Hexagon::JMP_EQriNotPt_ie_nv_V4:
-    case Hexagon::JMP_EQriNotPnt_ie_nv_V4:
-
-    // JMP_EQri - with -1
-    case Hexagon::JMP_EQriPtneg_ie_nv_V4:
-    case Hexagon::JMP_EQriPntneg_ie_nv_V4:
-    case Hexagon::JMP_EQriNotPtneg_ie_nv_V4:
-    case Hexagon::JMP_EQriNotPntneg_ie_nv_V4:
-
-    // JMP_EQrr
-    case Hexagon::JMP_EQrrPt_ie_nv_V4:
-    case Hexagon::JMP_EQrrPnt_ie_nv_V4:
-    case Hexagon::JMP_EQrrNotPt_ie_nv_V4:
-    case Hexagon::JMP_EQrrNotPnt_ie_nv_V4:
-
-    // JMP_GTri
-    case Hexagon::JMP_GTriPt_ie_nv_V4:
-    case Hexagon::JMP_GTriPnt_ie_nv_V4:
-    case Hexagon::JMP_GTriNotPt_ie_nv_V4:
-    case Hexagon::JMP_GTriNotPnt_ie_nv_V4:
-
-    // JMP_GTri - with -1
-    case Hexagon::JMP_GTriPtneg_ie_nv_V4:
-    case Hexagon::JMP_GTriPntneg_ie_nv_V4:
-    case Hexagon::JMP_GTriNotPtneg_ie_nv_V4:
-    case Hexagon::JMP_GTriNotPntneg_ie_nv_V4:
-
-    // JMP_GTrr
-    case Hexagon::JMP_GTrrPt_ie_nv_V4:
-    case Hexagon::JMP_GTrrPnt_ie_nv_V4:
-    case Hexagon::JMP_GTrrNotPt_ie_nv_V4:
-    case Hexagon::JMP_GTrrNotPnt_ie_nv_V4:
-
-    // JMP_GTrrdn
-    case Hexagon::JMP_GTrrdnPt_ie_nv_V4:
-    case Hexagon::JMP_GTrrdnPnt_ie_nv_V4:
-    case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4:
-    case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4:
-
-    // JMP_GTUri
-    case Hexagon::JMP_GTUriPt_ie_nv_V4:
-    case Hexagon::JMP_GTUriPnt_ie_nv_V4:
-    case Hexagon::JMP_GTUriNotPt_ie_nv_V4:
-    case Hexagon::JMP_GTUriNotPnt_ie_nv_V4:
-
-    // JMP_GTUrr
-    case Hexagon::JMP_GTUrrPt_ie_nv_V4:
-    case Hexagon::JMP_GTUrrPnt_ie_nv_V4:
-    case Hexagon::JMP_GTUrrNotPt_ie_nv_V4:
-    case Hexagon::JMP_GTUrrNotPnt_ie_nv_V4:
-
-    // JMP_GTUrrdn
-    case Hexagon::JMP_GTUrrdnPt_ie_nv_V4:
-    case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4:
-    case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4:
-    case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4:
-
-    // V4 absolute set addressing.
-    case Hexagon::LDrid_abs_setimm_V4:
-    case Hexagon::LDriw_abs_setimm_V4:
-    case Hexagon::LDrih_abs_setimm_V4:
-    case Hexagon::LDrib_abs_setimm_V4:
-    case Hexagon::LDriuh_abs_setimm_V4:
-    case Hexagon::LDriub_abs_setimm_V4:
-
-    case Hexagon::STrid_abs_setimm_V4:
-    case Hexagon::STrib_abs_setimm_V4:
-    case Hexagon::STrih_abs_setimm_V4:
-    case Hexagon::STriw_abs_setimm_V4:
-
-    // V4 global address load.
-    case Hexagon::LDd_GP_cPt_V4 :
-    case Hexagon::LDd_GP_cNotPt_V4 :
-    case Hexagon::LDd_GP_cdnPt_V4 :
-    case Hexagon::LDd_GP_cdnNotPt_V4 :
-    case Hexagon::LDb_GP_cPt_V4 :
-    case Hexagon::LDb_GP_cNotPt_V4 :
-    case Hexagon::LDb_GP_cdnPt_V4 :
-    case Hexagon::LDb_GP_cdnNotPt_V4 :
-    case Hexagon::LDub_GP_cPt_V4 :
-    case Hexagon::LDub_GP_cNotPt_V4 :
-    case Hexagon::LDub_GP_cdnPt_V4 :
-    case Hexagon::LDub_GP_cdnNotPt_V4 :
-    case Hexagon::LDh_GP_cPt_V4 :
-    case Hexagon::LDh_GP_cNotPt_V4 :
-    case Hexagon::LDh_GP_cdnPt_V4 :
-    case Hexagon::LDh_GP_cdnNotPt_V4 :
-    case Hexagon::LDuh_GP_cPt_V4 :
-    case Hexagon::LDuh_GP_cNotPt_V4 :
-    case Hexagon::LDuh_GP_cdnPt_V4 :
-    case Hexagon::LDuh_GP_cdnNotPt_V4 :
-    case Hexagon::LDw_GP_cPt_V4 :
-    case Hexagon::LDw_GP_cNotPt_V4 :
-    case Hexagon::LDw_GP_cdnPt_V4 :
-    case Hexagon::LDw_GP_cdnNotPt_V4 :
-
-    // V4 global address store.
-    case Hexagon::STd_GP_cPt_V4 :
-    case Hexagon::STd_GP_cNotPt_V4 :
-    case Hexagon::STd_GP_cdnPt_V4 :
-    case Hexagon::STd_GP_cdnNotPt_V4 :
-    case Hexagon::STb_GP_cPt_V4 :
-    case Hexagon::STb_GP_cNotPt_V4 :
-    case Hexagon::STb_GP_cdnPt_V4 :
-    case Hexagon::STb_GP_cdnNotPt_V4 :
-    case Hexagon::STh_GP_cPt_V4 :
-    case Hexagon::STh_GP_cNotPt_V4 :
-    case Hexagon::STh_GP_cdnPt_V4 :
-    case Hexagon::STh_GP_cdnNotPt_V4 :
-    case Hexagon::STw_GP_cPt_V4 :
-    case Hexagon::STw_GP_cNotPt_V4 :
-    case Hexagon::STw_GP_cdnPt_V4 :
-    case Hexagon::STw_GP_cdnNotPt_V4 :
-
-    // V4 predicated global address new value store.
-    case Hexagon::STb_GP_cPt_nv_V4 :
-    case Hexagon::STb_GP_cNotPt_nv_V4 :
-    case Hexagon::STb_GP_cdnPt_nv_V4 :
-    case Hexagon::STb_GP_cdnNotPt_nv_V4 :
-    case Hexagon::STh_GP_cPt_nv_V4 :
-    case Hexagon::STh_GP_cNotPt_nv_V4 :
-    case Hexagon::STh_GP_cdnPt_nv_V4 :
-    case Hexagon::STh_GP_cdnNotPt_nv_V4 :
-    case Hexagon::STw_GP_cPt_nv_V4 :
-    case Hexagon::STw_GP_cNotPt_nv_V4 :
-    case Hexagon::STw_GP_cdnPt_nv_V4 :
-    case Hexagon::STw_GP_cdnNotPt_nv_V4 :
-
-    // TFR_FI
-    case Hexagon::TFR_FI_immext_V4:
-
-    // TFRI_F
-    case Hexagon::TFRI_f:
-    case Hexagon::TFRI_cPt_f:
-    case Hexagon::TFRI_cNotPt_f:
-    case Hexagon::CONST64_Float_Real:
+  // First check if this is permanently extended op code.
+  const uint64_t F = MI->getDesc().TSFlags;
+  if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask)
+    return true;
+  // Use MO operand flags to determine if one of MI's operands
+  // has HMOTF_ConstExtended flag set.
+  for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+       E = MI->operands_end(); I != E; ++I) {
+    if (I->getTargetFlags() && HexagonII::HMOTF_ConstExtended)
       return true;
   }
+  return  false;
 }
 
 bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const {
@@ -877,258 +702,6 @@ bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const {
   }
 }
 
-unsigned HexagonInstrInfo::getImmExtForm(const MachineInstr* MI) const {
-  switch(MI->getOpcode()) {
-    default: llvm_unreachable("Unknown type of instruction.");
-    // JMP_EQri
-    case Hexagon::JMP_EQriPt_nv_V4:
-      return Hexagon::JMP_EQriPt_ie_nv_V4;
-    case Hexagon::JMP_EQriNotPt_nv_V4:
-      return Hexagon::JMP_EQriNotPt_ie_nv_V4;
-    case Hexagon::JMP_EQriPnt_nv_V4:
-      return Hexagon::JMP_EQriPnt_ie_nv_V4;
-    case Hexagon::JMP_EQriNotPnt_nv_V4:
-      return Hexagon::JMP_EQriNotPnt_ie_nv_V4;
-
-    // JMP_EQri -- with -1
-    case Hexagon::JMP_EQriPtneg_nv_V4:
-      return Hexagon::JMP_EQriPtneg_ie_nv_V4;
-    case Hexagon::JMP_EQriNotPtneg_nv_V4:
-      return Hexagon::JMP_EQriNotPtneg_ie_nv_V4;
-    case Hexagon::JMP_EQriPntneg_nv_V4:
-      return Hexagon::JMP_EQriPntneg_ie_nv_V4;
-    case Hexagon::JMP_EQriNotPntneg_nv_V4:
-      return Hexagon::JMP_EQriNotPntneg_ie_nv_V4;
-
-    // JMP_EQrr
-    case Hexagon::JMP_EQrrPt_nv_V4:
-      return Hexagon::JMP_EQrrPt_ie_nv_V4;
-    case Hexagon::JMP_EQrrNotPt_nv_V4:
-      return Hexagon::JMP_EQrrNotPt_ie_nv_V4;
-    case Hexagon::JMP_EQrrPnt_nv_V4:
-      return Hexagon::JMP_EQrrPnt_ie_nv_V4;
-    case Hexagon::JMP_EQrrNotPnt_nv_V4:
-      return Hexagon::JMP_EQrrNotPnt_ie_nv_V4;
-
-    // JMP_GTri
-    case Hexagon::JMP_GTriPt_nv_V4:
-      return Hexagon::JMP_GTriPt_ie_nv_V4;
-    case Hexagon::JMP_GTriNotPt_nv_V4:
-      return Hexagon::JMP_GTriNotPt_ie_nv_V4;
-    case Hexagon::JMP_GTriPnt_nv_V4:
-      return Hexagon::JMP_GTriPnt_ie_nv_V4;
-    case Hexagon::JMP_GTriNotPnt_nv_V4:
-      return Hexagon::JMP_GTriNotPnt_ie_nv_V4;
-
-    // JMP_GTri -- with -1
-    case Hexagon::JMP_GTriPtneg_nv_V4:
-      return Hexagon::JMP_GTriPtneg_ie_nv_V4;
-    case Hexagon::JMP_GTriNotPtneg_nv_V4:
-      return Hexagon::JMP_GTriNotPtneg_ie_nv_V4;
-    case Hexagon::JMP_GTriPntneg_nv_V4:
-      return Hexagon::JMP_GTriPntneg_ie_nv_V4;
-    case Hexagon::JMP_GTriNotPntneg_nv_V4:
-      return Hexagon::JMP_GTriNotPntneg_ie_nv_V4;
-
-    // JMP_GTrr
-    case Hexagon::JMP_GTrrPt_nv_V4:
-      return Hexagon::JMP_GTrrPt_ie_nv_V4;
-    case Hexagon::JMP_GTrrNotPt_nv_V4:
-      return Hexagon::JMP_GTrrNotPt_ie_nv_V4;
-    case Hexagon::JMP_GTrrPnt_nv_V4:
-      return Hexagon::JMP_GTrrPnt_ie_nv_V4;
-    case Hexagon::JMP_GTrrNotPnt_nv_V4:
-      return Hexagon::JMP_GTrrNotPnt_ie_nv_V4;
-
-    // JMP_GTrrdn
-    case Hexagon::JMP_GTrrdnPt_nv_V4:
-      return Hexagon::JMP_GTrrdnPt_ie_nv_V4;
-    case Hexagon::JMP_GTrrdnNotPt_nv_V4:
-      return Hexagon::JMP_GTrrdnNotPt_ie_nv_V4;
-    case Hexagon::JMP_GTrrdnPnt_nv_V4:
-      return Hexagon::JMP_GTrrdnPnt_ie_nv_V4;
-    case Hexagon::JMP_GTrrdnNotPnt_nv_V4:
-      return Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4;
-
-    // JMP_GTUri
-    case Hexagon::JMP_GTUriPt_nv_V4:
-      return Hexagon::JMP_GTUriPt_ie_nv_V4;
-    case Hexagon::JMP_GTUriNotPt_nv_V4:
-      return Hexagon::JMP_GTUriNotPt_ie_nv_V4;
-    case Hexagon::JMP_GTUriPnt_nv_V4:
-      return Hexagon::JMP_GTUriPnt_ie_nv_V4;
-    case Hexagon::JMP_GTUriNotPnt_nv_V4:
-      return Hexagon::JMP_GTUriNotPnt_ie_nv_V4;
-
-    // JMP_GTUrr
-    case Hexagon::JMP_GTUrrPt_nv_V4:
-      return Hexagon::JMP_GTUrrPt_ie_nv_V4;
-    case Hexagon::JMP_GTUrrNotPt_nv_V4:
-      return Hexagon::JMP_GTUrrNotPt_ie_nv_V4;
-    case Hexagon::JMP_GTUrrPnt_nv_V4:
-      return Hexagon::JMP_GTUrrPnt_ie_nv_V4;
-    case Hexagon::JMP_GTUrrNotPnt_nv_V4:
-      return Hexagon::JMP_GTUrrNotPnt_ie_nv_V4;
-
-    // JMP_GTUrrdn
-    case Hexagon::JMP_GTUrrdnPt_nv_V4:
-      return Hexagon::JMP_GTUrrdnPt_ie_nv_V4;
-    case Hexagon::JMP_GTUrrdnNotPt_nv_V4:
-      return Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4;
-    case Hexagon::JMP_GTUrrdnPnt_nv_V4:
-      return Hexagon::JMP_GTUrrdnPnt_ie_nv_V4;
-    case Hexagon::JMP_GTUrrdnNotPnt_nv_V4:
-      return Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4;
-
-    case Hexagon::TFR_FI:
-        return Hexagon::TFR_FI_immext_V4;
-
-    case Hexagon::MEMw_ADDi_indexed_MEM_V4 :
-    case Hexagon::MEMw_SUBi_indexed_MEM_V4 :
-    case Hexagon::MEMw_ADDr_indexed_MEM_V4 :
-    case Hexagon::MEMw_SUBr_indexed_MEM_V4 :
-    case Hexagon::MEMw_ANDr_indexed_MEM_V4 :
-    case Hexagon::MEMw_ORr_indexed_MEM_V4 :
-    case Hexagon::MEMw_ADDi_MEM_V4 :
-    case Hexagon::MEMw_SUBi_MEM_V4 :
-    case Hexagon::MEMw_ADDr_MEM_V4 :
-    case Hexagon::MEMw_SUBr_MEM_V4 :
-    case Hexagon::MEMw_ANDr_MEM_V4 :
-    case Hexagon::MEMw_ORr_MEM_V4 :
-    case Hexagon::MEMh_ADDi_indexed_MEM_V4 :
-    case Hexagon::MEMh_SUBi_indexed_MEM_V4 :
-    case Hexagon::MEMh_ADDr_indexed_MEM_V4 :
-    case Hexagon::MEMh_SUBr_indexed_MEM_V4 :
-    case Hexagon::MEMh_ANDr_indexed_MEM_V4 :
-    case Hexagon::MEMh_ORr_indexed_MEM_V4 :
-    case Hexagon::MEMh_ADDi_MEM_V4 :
-    case Hexagon::MEMh_SUBi_MEM_V4 :
-    case Hexagon::MEMh_ADDr_MEM_V4 :
-    case Hexagon::MEMh_SUBr_MEM_V4 :
-    case Hexagon::MEMh_ANDr_MEM_V4 :
-    case Hexagon::MEMh_ORr_MEM_V4 :
-    case Hexagon::MEMb_ADDi_indexed_MEM_V4 :
-    case Hexagon::MEMb_SUBi_indexed_MEM_V4 :
-    case Hexagon::MEMb_ADDr_indexed_MEM_V4 :
-    case Hexagon::MEMb_SUBr_indexed_MEM_V4 :
-    case Hexagon::MEMb_ANDr_indexed_MEM_V4 :
-    case Hexagon::MEMb_ORr_indexed_MEM_V4 :
-    case Hexagon::MEMb_ADDi_MEM_V4 :
-    case Hexagon::MEMb_SUBi_MEM_V4 :
-    case Hexagon::MEMb_ADDr_MEM_V4 :
-    case Hexagon::MEMb_SUBr_MEM_V4 :
-    case Hexagon::MEMb_ANDr_MEM_V4 :
-    case Hexagon::MEMb_ORr_MEM_V4 :
-      llvm_unreachable("Needs implementing.");
-  }
-}
-
-unsigned HexagonInstrInfo::getNormalBranchForm(const MachineInstr* MI) const {
-  switch(MI->getOpcode()) {
-    default: llvm_unreachable("Unknown type of jump instruction.");
-    // JMP_EQri
-    case Hexagon::JMP_EQriPt_ie_nv_V4:
-      return Hexagon::JMP_EQriPt_nv_V4;
-    case Hexagon::JMP_EQriNotPt_ie_nv_V4:
-      return Hexagon::JMP_EQriNotPt_nv_V4;
-    case Hexagon::JMP_EQriPnt_ie_nv_V4:
-      return Hexagon::JMP_EQriPnt_nv_V4;
-    case Hexagon::JMP_EQriNotPnt_ie_nv_V4:
-      return Hexagon::JMP_EQriNotPnt_nv_V4;
-
-    // JMP_EQri -- with -1
-    case Hexagon::JMP_EQriPtneg_ie_nv_V4:
-      return Hexagon::JMP_EQriPtneg_nv_V4;
-    case Hexagon::JMP_EQriNotPtneg_ie_nv_V4:
-      return Hexagon::JMP_EQriNotPtneg_nv_V4;
-    case Hexagon::JMP_EQriPntneg_ie_nv_V4:
-      return Hexagon::JMP_EQriPntneg_nv_V4;
-    case Hexagon::JMP_EQriNotPntneg_ie_nv_V4:
-      return Hexagon::JMP_EQriNotPntneg_nv_V4;
-
-    // JMP_EQrr
-    case Hexagon::JMP_EQrrPt_ie_nv_V4:
-      return Hexagon::JMP_EQrrPt_nv_V4;
-    case Hexagon::JMP_EQrrNotPt_ie_nv_V4:
-      return Hexagon::JMP_EQrrNotPt_nv_V4;
-    case Hexagon::JMP_EQrrPnt_ie_nv_V4:
-      return Hexagon::JMP_EQrrPnt_nv_V4;
-    case Hexagon::JMP_EQrrNotPnt_ie_nv_V4:
-      return Hexagon::JMP_EQrrNotPnt_nv_V4;
-
-    // JMP_GTri
-    case Hexagon::JMP_GTriPt_ie_nv_V4:
-      return Hexagon::JMP_GTriPt_nv_V4;
-    case Hexagon::JMP_GTriNotPt_ie_nv_V4:
-      return Hexagon::JMP_GTriNotPt_nv_V4;
-    case Hexagon::JMP_GTriPnt_ie_nv_V4:
-      return Hexagon::JMP_GTriPnt_nv_V4;
-    case Hexagon::JMP_GTriNotPnt_ie_nv_V4:
-      return Hexagon::JMP_GTriNotPnt_nv_V4;
-
-    // JMP_GTri -- with -1
-    case Hexagon::JMP_GTriPtneg_ie_nv_V4:
-      return Hexagon::JMP_GTriPtneg_nv_V4;
-    case Hexagon::JMP_GTriNotPtneg_ie_nv_V4:
-      return Hexagon::JMP_GTriNotPtneg_nv_V4;
-    case Hexagon::JMP_GTriPntneg_ie_nv_V4:
-      return Hexagon::JMP_GTriPntneg_nv_V4;
-    case Hexagon::JMP_GTriNotPntneg_ie_nv_V4:
-      return Hexagon::JMP_GTriNotPntneg_nv_V4;
-
-    // JMP_GTrr
-    case Hexagon::JMP_GTrrPt_ie_nv_V4:
-      return Hexagon::JMP_GTrrPt_nv_V4;
-    case Hexagon::JMP_GTrrNotPt_ie_nv_V4:
-      return Hexagon::JMP_GTrrNotPt_nv_V4;
-    case Hexagon::JMP_GTrrPnt_ie_nv_V4:
-      return Hexagon::JMP_GTrrPnt_nv_V4;
-    case Hexagon::JMP_GTrrNotPnt_ie_nv_V4:
-      return Hexagon::JMP_GTrrNotPnt_nv_V4;
-
-    // JMP_GTrrdn
-    case Hexagon::JMP_GTrrdnPt_ie_nv_V4:
-      return Hexagon::JMP_GTrrdnPt_nv_V4;
-    case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4:
-      return Hexagon::JMP_GTrrdnNotPt_nv_V4;
-    case Hexagon::JMP_GTrrdnPnt_ie_nv_V4:
-      return Hexagon::JMP_GTrrdnPnt_nv_V4;
-    case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4:
-      return Hexagon::JMP_GTrrdnNotPnt_nv_V4;
-
-    // JMP_GTUri
-    case Hexagon::JMP_GTUriPt_ie_nv_V4:
-      return Hexagon::JMP_GTUriPt_nv_V4;
-    case Hexagon::JMP_GTUriNotPt_ie_nv_V4:
-      return Hexagon::JMP_GTUriNotPt_nv_V4;
-    case Hexagon::JMP_GTUriPnt_ie_nv_V4:
-      return Hexagon::JMP_GTUriPnt_nv_V4;
-    case Hexagon::JMP_GTUriNotPnt_ie_nv_V4:
-      return Hexagon::JMP_GTUriNotPnt_nv_V4;
-
-    // JMP_GTUrr
-    case Hexagon::JMP_GTUrrPt_ie_nv_V4:
-      return Hexagon::JMP_GTUrrPt_nv_V4;
-    case Hexagon::JMP_GTUrrNotPt_ie_nv_V4:
-      return Hexagon::JMP_GTUrrNotPt_nv_V4;
-    case Hexagon::JMP_GTUrrPnt_ie_nv_V4:
-      return Hexagon::JMP_GTUrrPnt_nv_V4;
-    case Hexagon::JMP_GTUrrNotPnt_ie_nv_V4:
-      return Hexagon::JMP_GTUrrNotPnt_nv_V4;
-
-    // JMP_GTUrrdn
-    case Hexagon::JMP_GTUrrdnPt_ie_nv_V4:
-      return Hexagon::JMP_GTUrrdnPt_nv_V4;
-    case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4:
-      return Hexagon::JMP_GTUrrdnNotPt_nv_V4;
-    case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4:
-      return Hexagon::JMP_GTUrrdnPnt_nv_V4;
-    case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4:
-      return Hexagon::JMP_GTUrrdnNotPnt_nv_V4;
-  }
-}
-
-
 bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
   switch (MI->getOpcode()) {
     default: return false;
@@ -1326,6 +899,16 @@ bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const {
   }
 }
 
+bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const {
+  if (isNewValueJump(MI))
+    return true;
+
+  if (isNewValueStore(MI))
+    return true;
+
+  return false;
+}
+
 bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const {
   return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4;
 }
@@ -2366,6 +1949,10 @@ isValidOffset(const int Opcode, const int Offset) const {
   // the given "Opcode". If "Offset" is not in the correct range, "ADD_ri" is
   // inserted to calculate the final address. Due to this reason, the function
   // assumes that the "Offset" has correct alignment.
+  // We used to assert if the offset was not properly aligned, however,
+  // there are cases where a misaligned pointer recast can cause this
+  // problem, and we need to allow for it. The front end warns of such
+  // misaligns with respect to load size.
 
   switch(Opcode) {
 
@@ -2375,7 +1962,6 @@ isValidOffset(const int Opcode, const int Offset) const {
   case Hexagon::STriw_indexed:
   case Hexagon::STriw:
   case Hexagon::STriw_f:
-    assert((Offset % 4 == 0) && "Offset has incorrect alignment");
     return (Offset >= Hexagon_MEMW_OFFSET_MIN) &&
       (Offset <= Hexagon_MEMW_OFFSET_MAX);
 
@@ -2385,14 +1971,12 @@ isValidOffset(const int Opcode, const int Offset) const {
   case Hexagon::STrid:
   case Hexagon::STrid_indexed:
   case Hexagon::STrid_f:
-    assert((Offset % 8 == 0) && "Offset has incorrect alignment");
     return (Offset >= Hexagon_MEMD_OFFSET_MIN) &&
       (Offset <= Hexagon_MEMD_OFFSET_MAX);
 
   case Hexagon::LDrih:
   case Hexagon::LDriuh:
   case Hexagon::STrih:
-    assert((Offset % 2 == 0) && "Offset has incorrect alignment");
     return (Offset >= Hexagon_MEMH_OFFSET_MIN) &&
       (Offset <= Hexagon_MEMH_OFFSET_MAX);
 
@@ -2419,7 +2003,6 @@ isValidOffset(const int Opcode, const int Offset) const {
   case Hexagon::MEMw_SUBr_MEM_V4 :
   case Hexagon::MEMw_ANDr_MEM_V4 :
   case Hexagon::MEMw_ORr_MEM_V4 :
-    assert ((Offset % 4) == 0 && "MEMOPw offset is not aligned correctly." );
     return (0 <= Offset && Offset <= 255);
 
   case Hexagon::MEMh_ADDi_indexed_MEM_V4 :
@@ -2434,7 +2017,6 @@ isValidOffset(const int Opcode, const int Offset) const {
   case Hexagon::MEMh_SUBr_MEM_V4 :
   case Hexagon::MEMh_ANDr_MEM_V4 :
   case Hexagon::MEMh_ORr_MEM_V4 :
-    assert ((Offset % 2) == 0 && "MEMOPh offset is not aligned correctly." );
     return (0 <= Offset && Offset <= 127);
 
   case Hexagon::MEMb_ADDi_indexed_MEM_V4 :
@@ -2800,7 +2382,26 @@ isConditionalStore (const MachineInstr* MI) const {
   }
 }
 
+unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return((F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask);
+}
 
+/// immediateExtend - Changes the instruction in place to one using an immediate
+/// extender.
+void HexagonInstrInfo::immediateExtend(MachineInstr *MI) const {
+  assert((isExtendable(MI)||isConstExtended(MI)) &&
+                               "Instruction must be extendable");
+  // Find which operand is extendable.
+  short ExtOpNum = getCExtOpNum(MI);
+  MachineOperand &MO = MI->getOperand(ExtOpNum);
+  // This needs to be something we understand.
+  assert((MO.isMBB() || MO.isImm()) &&
+         "Branch with unknown extendable field type");
+  // Mark given operand as extended.
+  MO.addTargetFlag(HexagonII::HMOTF_ConstExtended);
+}
 
 DFAPacketizer *HexagonInstrInfo::
 CreateTargetScheduleState(const TargetMachine *TM,
@@ -2827,3 +2428,155 @@ bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
 
   return false;
 }
+
+bool HexagonInstrInfo::isConstExtended(MachineInstr *MI) const {
+
+  // Constant extenders are allowed only for V4 and above.
+  if (!Subtarget.hasV4TOps())
+    return false;
+
+  const uint64_t F = MI->getDesc().TSFlags;
+  unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+  if (isExtended) // Instruction must be extended.
+    return true;
+
+  unsigned isExtendable = (F >> HexagonII::ExtendablePos)
+                          & HexagonII::ExtendableMask;
+  if (!isExtendable)
+    return false;
+
+  short ExtOpNum = getCExtOpNum(MI);
+  const MachineOperand &MO = MI->getOperand(ExtOpNum);
+  // Use MO operand flags to determine if MO
+  // has the HMOTF_ConstExtended flag set.
+  if (MO.getTargetFlags() && HexagonII::HMOTF_ConstExtended)
+    return true;
+  // If this is a Machine BB address we are talking about, and it is
+  // not marked as extended, say so.
+  if (MO.isMBB())
+    return false;
+
+  // We could be using an instruction with an extendable immediate and shoehorn
+  // a global address into it. If it is a global address it will be constant
+  // extended. We do this for COMBINE.
+  // We currently only handle isGlobal() because it is the only kind of
+  // object we are going to end up with here for now.
+  // In the future we probably should add isSymbol(), etc.
+  if (MO.isGlobal() || MO.isSymbol())
+    return true;
+
+  // If the extendable operand is not 'Immediate' type, the instruction should
+  // have 'isExtended' flag set.
+  assert(MO.isImm() && "Extendable operand must be Immediate type");
+
+  int MinValue = getMinValue(MI);
+  int MaxValue = getMaxValue(MI);
+  int ImmValue = MO.getImm();
+
+  return (ImmValue < MinValue || ImmValue > MaxValue);
+}
+
+// Returns true if a particular operand is extendable for an instruction.
+bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI,
+                                         unsigned short OperandNum) const {
+  // Constant extenders are allowed only for V4 and above.
+  if (!Subtarget.hasV4TOps())
+    return false;
+
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask)
+          == OperandNum;
+}
+
+// Returns Operand Index for the constant extended instruction.
+unsigned short HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
+}
+
+// Returns the min value that doesn't need to be extended.
+int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return -1 << (bits - 1);
+  else
+    return 0;
+}
+
+// Returns the max value that doesn't need to be extended.
+int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return ~(-1 << (bits - 1));
+  else
+    return ~(-1 << bits);
+}
+
+// Returns true if an instruction can be converted into a non-extended
+// equivalent instruction.
+bool HexagonInstrInfo::NonExtEquivalentExists (const MachineInstr *MI) const {
+
+  short NonExtOpcode;
+  // Check if the instruction has a register form that uses register in place
+  // of the extended operand, if so return that as the non-extended form.
+  if (Hexagon::getRegForm(MI->getOpcode()) >= 0)
+    return true;
+
+  if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) {
+    // Check addressing mode and retreive non-ext equivalent instruction.
+
+    switch (getAddrMode(MI)) {
+    case HexagonII::Absolute :
+      // Load/store with absolute addressing mode can be converted into
+      // base+offset mode.
+      NonExtOpcode = Hexagon::getBasedWithImmOffset(MI->getOpcode());
+      break;
+    case HexagonII::BaseImmOffset :
+      // Load/store with base+offset addressing mode can be converted into
+      // base+register offset addressing mode. However left shift operand should
+      // be set to 0.
+      NonExtOpcode = Hexagon::getBaseWithRegOffset(MI->getOpcode());
+      break;
+    default:
+      return false;
+    }
+    if (NonExtOpcode < 0)
+      return false;
+    return true;
+  }
+  return false;
+}
+
+// Returns opcode of the non-extended equivalent instruction.
+short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const {
+
+  // Check if the instruction has a register form that uses register in place
+  // of the extended operand, if so return that as the non-extended form.
+  short NonExtOpcode = Hexagon::getRegForm(MI->getOpcode());
+    if (NonExtOpcode >= 0)
+      return NonExtOpcode;
+
+  if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) {
+    // Check addressing mode and retreive non-ext equivalent instruction.
+    switch (getAddrMode(MI)) {
+    case HexagonII::Absolute :
+      return Hexagon::getBasedWithImmOffset(MI->getOpcode());
+    case HexagonII::BaseImmOffset :
+      return Hexagon::getBaseWithRegOffset(MI->getOpcode());
+    default:
+      return -1;
+    }
+  }
+  return -1;
+}
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 4e36dfb..d2f059a 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -169,6 +169,7 @@ public:
   bool isConditionalALU32 (const MachineInstr* MI) const;
   bool isConditionalLoad (const MachineInstr* MI) const;
   bool isConditionalStore(const MachineInstr* MI) const;
+  bool isNewValueInst(const MachineInstr* MI) const;
   bool isDeallocRet(const MachineInstr *MI) const;
   unsigned getInvertedPredicatedOpcode(const int Opc) const;
   bool isExtendable(const MachineInstr* MI) const;
@@ -177,9 +178,18 @@ public:
   bool isNewValueStore(const MachineInstr* MI) const;
   bool isNewValueJump(const MachineInstr* MI) const;
   bool isNewValueJumpCandidate(const MachineInstr *MI) const;
-  unsigned getImmExtForm(const MachineInstr* MI) const;
-  unsigned getNormalBranchForm(const MachineInstr* MI) const;
 
+
+  void immediateExtend(MachineInstr *MI) const;
+  bool isConstExtended(MachineInstr *MI) const;
+  unsigned getAddrMode(const MachineInstr* MI) const;
+  bool isOperandExtended(const MachineInstr *MI,
+                         unsigned short OperandNum) const;
+  unsigned short getCExtOpNum(const MachineInstr *MI) const;
+  int getMinValue(const MachineInstr *MI) const;
+  int getMaxValue(const MachineInstr *MI) const;
+  bool NonExtEquivalentExists (const MachineInstr *MI) const;
+  short getNonExtOpcode(const MachineInstr *MI) const;
 private:
   int getMatchingCondBranchOpcode(int Opc, bool sense) const;
 
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index 082772a..d7bab20 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -251,28 +251,55 @@ multiclass TFR_base<string CextOp> {
   }
 }
 
+class T_TFR64_Pred<bit PredNot, bit isPredNew>
+            : ALU32_rr<(outs DoubleRegs:$dst),
+                       (ins PredRegs:$src1, DoubleRegs:$src2),
+            !if(PredNot, "if (!$src1", "if ($src1")#
+            !if(isPredNew, ".new) ", ") ")#"$dst = $src2", []>
+{
+    bits<5> dst;
+    bits<2> src1;
+    bits<5> src2;
+
+    let IClass = 0b1111;
+    let Inst{27-24} = 0b1101;
+    let Inst{13} = isPredNew;
+    let Inst{7} = PredNot;
+    let Inst{4-0} = dst;
+    let Inst{6-5} = src1;
+    let Inst{20-17} = src2{4-1};
+    let Inst{16} = 0b1;
+    let Inst{12-9} = src2{4-1};
+    let Inst{8} = 0b0;
+}
+
 multiclass TFR64_Pred<bit PredNot> {
   let PredSense = !if(PredNot, "false", "true") in {
-    def _c#NAME : ALU32_rr<(outs DoubleRegs:$dst),
-                           (ins PredRegs:$src1, DoubleRegs:$src2),
-            !if(PredNot, "if (!$src1", "if ($src1")#") $dst = $src2",
-            []>;
-    // Predicate new
+    def _c#NAME : T_TFR64_Pred<PredNot, 0>;
+
     let PNewValue = "new" in
-    def _cdn#NAME : ALU32_rr<(outs DoubleRegs:$dst),
-                             (ins PredRegs:$src1, DoubleRegs:$src2),
-            !if(PredNot, "if (!$src1", "if ($src1")#".new) $dst = $src2",
-            []>;
+    def _cdn#NAME : T_TFR64_Pred<PredNot, 1>; // Predicate new
   }
 }
 
-let InputType = "reg", neverHasSideEffects = 1 in
-multiclass TFR64_base<string CextOp> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp in {
+let neverHasSideEffects = 1 in
+multiclass TFR64_base<string BaseName> {
+  let BaseOpcode = BaseName in {
     let isPredicable = 1 in
-    def NAME : ALU32_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
-            "$dst = $src1",
-            []>;
+    def NAME : ALU32Inst <(outs DoubleRegs:$dst),
+                          (ins DoubleRegs:$src1),
+                          "$dst = $src1" > {
+        bits<5> dst;
+        bits<5> src1;
+
+        let IClass = 0b1111;
+        let Inst{27-23} = 0b01010;
+        let Inst{4-0} = dst;
+        let Inst{20-17} = src1{4-1};
+        let Inst{16} = 0b1;
+        let Inst{12-9} = src1{4-1};
+        let Inst{8} = 0b0;
+    }
 
     let  isPredicated = 1 in {
       defm Pt : TFR64_Pred<0>;
@@ -281,9 +308,8 @@ multiclass TFR64_base<string CextOp> {
   }
 }
 
-
 multiclass TFRI_Pred<bit PredNot> {
-  let PredSense = !if(PredNot, "false", "true") in {
+  let isMoveImm = 1, PredSense = !if(PredNot, "false", "true") in {
     def _c#NAME : ALU32_ri<(outs IntRegs:$dst),
                            (ins PredRegs:$src1, s12Ext:$src2),
             !if(PredNot, "if (!$src1", "if ($src1")#") $dst = #$src2",
@@ -301,8 +327,8 @@ multiclass TFRI_Pred<bit PredNot> {
 let InputType = "imm", isExtendable = 1, isExtentSigned = 1 in
 multiclass TFRI_base<string CextOp> {
   let CextOpcode = CextOp, BaseOpcode = CextOp#I in {
-    let opExtendable = 1, opExtentBits = 16, isMoveImm = 1, isPredicable = 1,
-    isReMaterializable = 1 in
+    let isAsCheapAsAMove = 1 , opExtendable = 1, opExtentBits = 16,
+    isMoveImm = 1, isPredicable = 1, isReMaterializable = 1 in
     def NAME : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1),
             "$dst = #$src1",
             [(set (i32 IntRegs:$dst), s16ExtPred:$src1)]>;
@@ -317,7 +343,7 @@ multiclass TFRI_base<string CextOp> {
 
 defm TFRI : TFRI_base<"TFR">, ImmRegRel, PredNewRel;
 defm TFR : TFR_base<"TFR">, ImmRegRel, PredNewRel;
-defm TFR64 : TFR64_base<"TFR64">, ImmRegRel, PredNewRel;
+defm TFR64 : TFR64_base<"TFR64">, PredNewRel;
 
 // Transfer control register.
 let neverHasSideEffects = 1 in
@@ -2050,6 +2076,10 @@ def CONST32_Int_Real : LDInst2<(outs IntRegs:$dst), (ins i32imm:$global),
                        "$dst = CONST32(#$global)",
                        [(set (i32 IntRegs:$dst), imm:$global) ]>;
 
+// Map BlockAddress lowering to CONST32_Int_Real
+def : Pat<(HexagonCONST32_GP tblockaddress:$addr),
+          (CONST32_Int_Real tblockaddress:$addr)>;
+
 let isReMaterializable = 1, isMoveImm = 1 in
 def CONST32_Label : LDInst2<(outs IntRegs:$dst), (ins bblabel:$label),
                     "$dst = CONST32($label)",
@@ -2845,6 +2875,18 @@ def:  Pat <(i64 (zextloadi8 (add (i32 IntRegs:$src1),
                                   s11_0ExtPred:$offset)))>,
       Requires<[NoV4T]>;
 
+// i1 -> i64
+def:  Pat <(i64 (zextloadi1 ADDRriS11_0:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDriub ADDRriS11_0:$src1)))>,
+      Requires<[NoV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi1 (add (i32 IntRegs:$src1),
+                                s11_0ExtPred:$offset))),
+      (i64 (COMBINE_rr (TFRI 0), (LDriub_indexed IntRegs:$src1,
+                                  s11_0ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
 // i16 -> i64
 def:  Pat <(i64 (zextloadi16 ADDRriS11_1:$src1)),
       (i64 (COMBINE_rr (TFRI 0), (LDriuh ADDRriS11_1:$src1)))>,
@@ -3020,6 +3062,11 @@ def BR_JT : JRInst<(outs), (ins IntRegs:$src),
                    "jumpr $src",
                    [(HexagonBR_JT (i32 IntRegs:$src))]>;
 
+let isBranch=1, isIndirectBranch=1, isTerminator=1 in
+def BRIND : JRInst<(outs), (ins IntRegs:$src),
+                   "jumpr $src",
+                   [(brind (i32 IntRegs:$src))]>;
+
 def HexagonWrapperJT: SDNode<"HexagonISD::WrapperJT", SDTIntUnaryOp>;
 
 def : Pat<(HexagonWrapperJT tjumptable:$dst),
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index e1b2f88..1d0643d 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -940,6 +940,18 @@ def:  Pat <(i64 (zextloadi8 (add (i32 IntRegs:$src1),
                                   s11_0ExtPred:$offset)))>,
       Requires<[HasV4T]>;
 
+// zext i1->i64
+def:  Pat <(i64 (zextloadi1 ADDRriS11_0:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDriub ADDRriS11_0:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi1 (add (i32 IntRegs:$src1),
+                                s11_0ExtPred:$offset))),
+      (i64 (COMBINE_Ir_V4 0, (LDriub_indexed IntRegs:$src1,
+                                  s11_0ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
 // zext i16->i64
 def:  Pat <(i64 (zextloadi16 ADDRriS11_1:$src1)),
       (i64 (COMBINE_Ir_V4 0, (LDriuh ADDRriS11_1:$src1)))>,
@@ -3790,6 +3802,11 @@ def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$src1),
            [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>,
            Requires<[HasV4T]>;
 
+// Transfer a block address into a register
+def : Pat<(HexagonCONST32_GP tblockaddress:$src1),
+          (TFRI_V4 tblockaddress:$src1)>,
+          Requires<[HasV4T]>;
+
 let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
 def TFRI_cPt_V4 : ALU32_ri<(outs IntRegs:$dst),
                            (ins PredRegs:$src1, globaladdress:$src2),
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index ced17b3..1388ad4 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -15,7 +15,8 @@
 #define DEBUG_TYPE "misched"
 
 #include "HexagonMachineScheduler.h"
-#include <queue>
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/IR/Function.h"
 
 using namespace llvm;
 
@@ -159,6 +160,9 @@ void VLIWMachineScheduler::schedule() {
   SchedImpl->initialize(this);
 
   // To view Height/Depth correctly, they should be accessed at least once.
+  //
+  // FIXME: SUnit::dumpAll always recompute depth and height now. The max
+  // depth/height could be computed directly from the roots and leaves.
   DEBUG(unsigned maxH = 0;
         for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
           if (SUnits[su].getHeight() > maxH)
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index aff6b86..866beb1 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -17,35 +17,36 @@
 //
 //===----------------------------------------------------------------------===//
 #define DEBUG_TYPE "packets"
-#include "Hexagon.h"
-#include "HexagonMachineFunctionInfo.h"
-#include "HexagonRegisterInfo.h"
-#include "HexagonSubtarget.h"
-#include "HexagonTargetMachine.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/DFAPacketizer.h"
-#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
-#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonMachineFunctionInfo.h"
+
 #include <map>
 
 using namespace llvm;
@@ -257,7 +258,7 @@ void HexagonPacketizerList::reserveResourcesForConstExt(MachineInstr* MI) {
 
 bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) {
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
-  assert(QII->isExtended(MI) &&
+  assert((QII->isExtended(MI) || QII->isConstExtended(MI)) &&
          "Should only be called for constant extended instructions");
   MachineFunction *MF = MI->getParent()->getParent();
   MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT_i),
@@ -350,17 +351,6 @@ static bool IsControlFlow(MachineInstr* MI) {
   return (MI->getDesc().isTerminator() || MI->getDesc().isCall());
 }
 
-bool HexagonPacketizerList::isNewValueInst(MachineInstr* MI) {
-  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
-  if (QII->isNewValueJump(MI))
-    return true;
-
-  if (QII->isNewValueStore(MI))
-    return true;
-
-  return false;
-}
-
 // Function returns true if an instruction can be promoted to the new-value
 // store. It will always return false for v2 and v3.
 // It lists all the conditional and unconditional stores that can be promoted
@@ -2165,7 +2155,8 @@ static bool GetPredicateSense(MachineInstr* MI,
 }
 
 bool HexagonPacketizerList::isDotNewInst(MachineInstr* MI) {
-  if (isNewValueInst(MI))
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  if (QII->isNewValueInst(MI))
     return true;
 
   switch (MI->getOpcode()) {
@@ -2893,13 +2884,13 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
   // dealloc_return and memop always take SLOT0.
   // Arch spec 3.4.4.2
   if (QRI->Subtarget.hasV4TOps()) {
-
-    if (MCIDI.mayStore() && MCIDJ.mayStore() && isNewValueInst(J)) {
+    if (MCIDI.mayStore() && MCIDJ.mayStore() &&
+       (QII->isNewValueInst(J) || QII->isMemOp(J) || QII->isMemOp(I))) {
       Dependence = true;
       return false;
     }
 
-    if (   (QII->isMemOp(J) && MCIDI.mayStore())
+    if ((QII->isMemOp(J) && MCIDI.mayStore())
         || (MCIDJ.mayStore() && QII->isMemOp(I))
         || (QII->isMemOp(J) && QII->isMemOp(I))) {
       Dependence = true;
@@ -3196,7 +3187,7 @@ HexagonPacketizerList::addToPacket(MachineInstr *MI) {
       MachineInstr *nvjMI = MII;
       assert(ResourceTracker->canReserveResources(MI));
       ResourceTracker->reserveResources(MI);
-      if (QII->isExtended(MI) &&
+      if ((QII->isExtended(MI) || QII->isConstExtended(MI)) &&
           !tryAllocateResourcesForConstExt(MI)) {
         endPacket(MBB, MI);
         ResourceTracker->reserveResources(MI);
@@ -3216,7 +3207,7 @@ HexagonPacketizerList::addToPacket(MachineInstr *MI) {
             && (!tryAllocateResourcesForConstExt(nvjMI)
                 || !ResourceTracker->canReserveResources(nvjMI)))
         || // For non-extended instruction, no need to allocate extra 4 bytes.
-        (!QII->isExtended(nvjMI) && 
+        (!QII->isExtended(nvjMI) &&
               !ResourceTracker->canReserveResources(nvjMI)))
       {
         endPacket(MBB, MI);
@@ -3232,7 +3223,7 @@ HexagonPacketizerList::addToPacket(MachineInstr *MI) {
       CurrentPacketMIs.push_back(MI);
       CurrentPacketMIs.push_back(nvjMI);
     } else {
-      if (   QII->isExtended(MI)
+      if (   (QII->isExtended(MI) || QII->isConstExtended(MI))
           && (   !tryAllocateResourcesForConstExt(MI)
               || !ResourceTracker->canReserveResources(MI)))
       {
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index 5f9718b..d4a93b5 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -154,6 +154,28 @@ namespace HexagonII {
 
   // *** The code above must match HexagonInstrFormat*.td *** //
 
+  // Hexagon specific MO operand flag mask.
+  enum HexagonMOTargetFlagVal {
+    //===------------------------------------------------------------------===//
+    // Hexagon Specific MachineOperand flags.
+    MO_NO_FLAG,
+
+    HMOTF_ConstExtended = 1,
+
+    /// MO_PCREL - On a symbol operand, indicates a PC-relative relocation
+    /// Used for computing a global address for PIC compilations
+    MO_PCREL,
+
+    /// MO_GOT - Indicates a GOT-relative relocation
+    MO_GOT,
+
+    // Low or high part of a symbol.
+    MO_LO16, MO_HI16,
+
+    // Offset from the base of the SDA.
+    MO_GPREL
+  };
+
 } // End namespace HexagonII.
 
 } // End namespace llvm.
diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
index ad495ff..dda6e24 100644
--- a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
+++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
@@ -82,29 +82,35 @@ struct MBlazeOperand : public MCParsedAsmOperand {
 
   SMLoc StartLoc, EndLoc;
 
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    unsigned Base;
+    unsigned OffReg;
+    const MCExpr *Off;
+  };
+
+  struct FslImmOp {
+    const MCExpr *Val;
+  };
+
   union {
-    struct {
-      const char *Data;
-      unsigned Length;
-    } Tok;
-
-    struct {
-      unsigned RegNum;
-    } Reg;
-
-    struct {
-      const MCExpr *Val;
-    } Imm;
-
-    struct {
-      unsigned Base;
-      unsigned OffReg;
-      const MCExpr *Off;
-    } Mem;
-
-    struct {
-      const MCExpr *Val;
-    } FslImm;
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+    struct FslImmOp FslImm;
   };
 
   MBlazeOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp
index 7664c60..d4f9432 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp
@@ -160,7 +160,8 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)
   // Operations not directly supported by MBlaze.
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32,   Expand);
   setOperationAction(ISD::BR_JT,              MVT::Other, Expand);
-  setOperationAction(ISD::BR_CC,              MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,              MVT::f32,   Expand);
+  setOperationAction(ISD::BR_CC,              MVT::i32,   Expand);
   setOperationAction(ISD::SIGN_EXTEND_INREG,  MVT::i1,    Expand);
   setOperationAction(ISD::ROTL,               MVT::i32,   Expand);
   setOperationAction(ISD::ROTR,               MVT::i32,   Expand);
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
index d0fd7dc..bd83afc 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
@@ -122,7 +122,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
 }
 
 void MBlazeRegisterInfo::
-processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
+processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *) const {
   // Set the stack offset where GP must be saved/loaded from.
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h
index 99a2fac..497f386 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.h
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h
@@ -55,7 +55,8 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo {
                            int SPAdj, unsigned FIOperandNum,
                            RegScavenger *RS = NULL) const;
 
-  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                           RegScavenger *RS = NULL) const;
 
   /// Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const;
diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp
index ae2e556..e504011 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -285,8 +285,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
 }
 
 void
-MSP430FrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
-                                                                         const {
+MSP430FrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                                         RegScavenger *) const {
   // Create a frame entry for the FPW register that must be saved.
   if (hasFP(MF)) {
     int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true);
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index a077dd7..c673f59 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -50,7 +50,8 @@ public:
 
   bool hasFP(const MachineFunction &MF) const;
   bool hasReservedCallFrame(const MachineFunction &MF) const;
-  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                       RegScavenger *RS = NULL) const;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index ab4e64e..e0ed870 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -73,7 +73,7 @@ namespace llvm {
   public:
     explicit MSP430TargetLowering(MSP430TargetMachine &TM);
 
-    virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
+    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
 
     /// LowerOperation - Provide custom lowering hooks for some operations.
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index ade6084..ebe12c9 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -211,25 +211,30 @@ private:
 
   MipsOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
 
+  struct Token {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+    RegisterKind Kind;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    unsigned Base;
+    const MCExpr *Off;
+  };
+
   union {
-    struct {
-      const char *Data;
-      unsigned Length;
-    } Tok;
-
-    struct {
-      unsigned RegNum;
-      RegisterKind Kind;
-    } Reg;
-
-    struct {
-      const MCExpr *Val;
-    } Imm;
-
-    struct {
-      unsigned Base;
-      const MCExpr *Off;
-    } Mem;
+    struct Token Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
   };
 
   SMLoc StartLoc, EndLoc;
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index d6fac0c..cf8bb18 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -16,10 +16,13 @@ add_public_tablegen_target(MipsCommonTableGen)
 add_llvm_target(MipsCodeGen
   Mips16FrameLowering.cpp
   Mips16InstrInfo.cpp
+  Mips16ISelDAGToDAG.cpp
+  Mips16ISelLowering.cpp
   Mips16RegisterInfo.cpp
   MipsAnalyzeImmediate.cpp
   MipsAsmPrinter.cpp
   MipsCodeEmitter.cpp
+  MipsConstantIslandPass.cpp
   MipsDelaySlotFiller.cpp
   MipsJITInfo.cpp
   MipsInstrInfo.cpp
@@ -32,6 +35,8 @@ add_llvm_target(MipsCodeGen
   MipsRegisterInfo.cpp
   MipsSEFrameLowering.cpp
   MipsSEInstrInfo.cpp
+  MipsSEISelDAGToDAG.cpp
+  MipsSEISelLowering.cpp
   MipsSERegisterInfo.cpp
   MipsSubtarget.cpp
   MipsTargetMachine.cpp
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 2963f7e..8c65bb4 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -27,6 +27,7 @@ namespace llvm {
   FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
   FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
                                              JITCodeEmitter &JCE);
+  FunctionPass *createMipsConstantIslandPass(MipsTargetMachine &tm);
 
 } // end namespace llvm;
 
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 1326623..eefb02a 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -44,8 +44,6 @@ def FeatureN64         : SubtargetFeature<"n64", "MipsABI", "N64",
                                 "Enable n64 ABI">;
 def FeatureEABI        : SubtargetFeature<"eabi", "MipsABI", "EABI",
                                 "Enable eabi ABI">;
-def FeatureAndroid     : SubtargetFeature<"android", "IsAndroid", "true",
-                                "Target is android">;
 def FeatureVFPU        : SubtargetFeature<"vfpu", "HasVFPU",
                                 "true", "Enable vector FPU instructions.">;
 def FeatureSEInReg     : SubtargetFeature<"seinreg", "HasSEInReg", "true",
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
new file mode 100644
index 0000000..00b3449
--- /dev/null
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -0,0 +1,308 @@
+//===-- Mips16ISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips16 ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-isel"
+#include "Mips16ISelDAGToDAG.h"
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+/// Select multiply instructions.
+std::pair<SDNode*, SDNode*>
+Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, DebugLoc DL, EVT Ty,
+                               bool HasLo, bool HasHi) {
+  SDNode *Lo = 0, *Hi = 0;
+  SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0),
+                                       N->getOperand(1));
+  SDValue InFlag = SDValue(Mul, 0);
+
+  if (HasLo) {
+    unsigned Opcode = Mips::Mflo16;
+    Lo = CurDAG->getMachineNode(Opcode, DL, Ty, MVT::Glue, InFlag);
+    InFlag = SDValue(Lo, 1);
+  }
+  if (HasHi) {
+    unsigned Opcode = Mips::Mfhi16;
+    Hi = CurDAG->getMachineNode(Opcode, DL, Ty, InFlag);
+  }
+  return std::make_pair(Lo, Hi);
+}
+
+void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (!MipsFI->globalBaseRegSet())
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+  const TargetRegisterClass *RC =
+    (const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+
+  V0 = RegInfo.createVirtualRegister(RC);
+  V1 = RegInfo.createVirtualRegister(RC);
+  V2 = RegInfo.createVirtualRegister(RC);
+
+  BuildMI(MBB, I, DL, TII.get(Mips::LiRxImmX16), V0)
+    .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+  BuildMI(MBB, I, DL, TII.get(Mips::AddiuRxPcImmX16), V1)
+    .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+  BuildMI(MBB, I, DL, TII.get(Mips::SllX16), V2).addReg(V0).addImm(16);
+  BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg)
+    .addReg(V1).addReg(V2);
+}
+
+// Insert instructions to initialize the Mips16 SP Alias register in the
+// first MBB of the function.
+//
+void Mips16DAGToDAGISel::initMips16SPAliasReg(MachineFunction &MF) {
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (!MipsFI->mips16SPAliasRegSet())
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  unsigned Mips16SPAliasReg = MipsFI->getMips16SPAliasReg();
+
+  BuildMI(MBB, I, DL, TII.get(Mips::MoveR3216), Mips16SPAliasReg)
+    .addReg(Mips::SP);
+}
+
+void Mips16DAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+  initGlobalBaseReg(MF);
+  initMips16SPAliasReg(MF);
+}
+
+/// getMips16SPAliasReg - Output the instructions required to put the
+/// SP into a Mips16 accessible aliased register.
+SDValue Mips16DAGToDAGISel::getMips16SPAliasReg() {
+  unsigned Mips16SPAliasReg =
+    MF->getInfo<MipsFunctionInfo>()->getMips16SPAliasReg();
+  return CurDAG->getRegister(Mips16SPAliasReg, TLI.getPointerTy());
+}
+
+void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
+  SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, TLI.getPointerTy());
+  if (Parent) {
+    switch (Parent->getOpcode()) {
+      case ISD::LOAD: {
+        LoadSDNode *SD = dyn_cast<LoadSDNode>(Parent);
+        switch (SD->getMemoryVT().getSizeInBits()) {
+        case 8:
+        case 16:
+          AliasReg = TM.getFrameLowering()->hasFP(*MF)?
+            AliasFPReg: getMips16SPAliasReg();
+          return;
+        }
+        break;
+      }
+      case ISD::STORE: {
+        StoreSDNode *SD = dyn_cast<StoreSDNode>(Parent);
+        switch (SD->getMemoryVT().getSizeInBits()) {
+        case 8:
+        case 16:
+          AliasReg = TM.getFrameLowering()->hasFP(*MF)?
+            AliasFPReg: getMips16SPAliasReg();
+          return;
+        }
+        break;
+      }
+    }
+  }
+  AliasReg = CurDAG->getRegister(Mips::SP, TLI.getPointerTy());
+  return;
+
+}
+
+bool Mips16DAGToDAGISel::selectAddr16(
+  SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset,
+  SDValue &Alias) {
+  EVT ValTy = Addr.getValueType();
+
+  Alias = CurDAG->getTargetConstant(0, ValTy);
+
+  // if Address is FI, get the TargetFrameIndex.
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+    Offset = CurDAG->getTargetConstant(0, ValTy);
+    getMips16SPRefReg(Parent, Alias);
+    return true;
+  }
+  // on PIC code Load GA
+  if (Addr.getOpcode() == MipsISD::Wrapper) {
+    Base   = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+  if (TM.getRelocationModel() != Reloc::PIC_) {
+    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+        Addr.getOpcode() == ISD::TargetGlobalAddress))
+      return false;
+  }
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<16>(CN->getSExtValue())) {
+
+      // If the first operand is a FI, get the TargetFI Node
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+                                  (Addr.getOperand(0))) {
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+        getMips16SPRefReg(Parent, Alias);
+      }
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
+      return true;
+    }
+  }
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    // When loading from constant pools, load the lower address part in
+    // the instruction itself. Example, instead of:
+    //  lui $2, %hi($CPI1_0)
+    //  addiu $2, $2, %lo($CPI1_0)
+    //  lwc1 $f0, 0($2)
+    // Generate:
+    //  lui $2, %hi($CPI1_0)
+    //  lwc1 $f0, %lo($CPI1_0)($2)
+    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
+      if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+          isa<JumpTableSDNode>(Opnd0)) {
+        Base = Addr.getOperand(0);
+        Offset = Opnd0;
+        return true;
+      }
+    }
+
+    // If an indexed floating point load/store can be emitted, return false.
+    const LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Parent);
+
+    if (LS &&
+        (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) &&
+        Subtarget.hasFPIdx())
+      return false;
+  }
+  Base   = Addr;
+  Offset = CurDAG->getTargetConstant(0, ValTy);
+  return true;
+}
+
+/// Select instructions not customized! Used for
+/// expanded, promoted and normal instructions
+std::pair<bool, SDNode*> Mips16DAGToDAGISel::selectNode(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+  DebugLoc DL = Node->getDebugLoc();
+
+  ///
+  // Instruction Selection not handled by the auto-generated
+  // tablegen selection should be handled here.
+  ///
+  EVT NodeTy = Node->getValueType(0);
+  unsigned MultOpc;
+
+  switch(Opcode) {
+  default: break;
+
+  case ISD::SUBE:
+  case ISD::ADDE: {
+    SDValue InFlag = Node->getOperand(2), CmpLHS;
+    unsigned Opc = InFlag.getOpcode(); (void)Opc;
+    assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+            (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+           "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+    unsigned MOp;
+    if (Opcode == ISD::ADDE) {
+      CmpLHS = InFlag.getValue(0);
+      MOp = Mips::AdduRxRyRz16;
+    } else {
+      CmpLHS = InFlag.getOperand(0);
+      MOp = Mips::SubuRxRyRz16;
+    }
+
+    SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+
+    SDValue LHS = Node->getOperand(0);
+    SDValue RHS = Node->getOperand(1);
+
+    EVT VT = LHS.getValueType();
+
+    unsigned Sltu_op = Mips::SltuRxRyRz16;
+    SDNode *Carry = CurDAG->getMachineNode(Sltu_op, DL, VT, Ops, 2);
+    unsigned Addu_op = Mips::AdduRxRyRz16;
+    SDNode *AddCarry = CurDAG->getMachineNode(Addu_op, DL, VT,
+                                              SDValue(Carry,0), RHS);
+
+    SDNode *Result = CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS,
+                                          SDValue(AddCarry,0));
+    return std::make_pair(true, Result);
+  }
+
+  /// Mul with two results
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI: {
+    MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MultuRxRy16 : Mips::MultRxRy16);
+    std::pair<SDNode*, SDNode*> LoHi = selectMULT(Node, MultOpc, DL, NodeTy,
+                                                  true, true);
+    if (!SDValue(Node, 0).use_empty())
+      ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));
+
+    if (!SDValue(Node, 1).use_empty())
+      ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
+
+    return std::make_pair(true, (SDNode*)NULL);
+  }
+
+  case ISD::MULHS:
+  case ISD::MULHU: {
+    MultOpc = (Opcode == ISD::MULHU ? Mips::MultuRxRy16 : Mips::MultRxRy16);
+    SDNode *Result = selectMULT(Node, MultOpc, DL, NodeTy, false, true).second;
+    return std::make_pair(true, Result);
+  }
+  }
+
+  return std::make_pair(false, (SDNode*)NULL);
+}
+
+FunctionPass *llvm::createMips16ISelDag(MipsTargetMachine &TM) {
+  return new Mips16DAGToDAGISel(TM);
+}
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.h b/lib/Target/Mips/Mips16ISelDAGToDAG.h
new file mode 100644
index 0000000..baa8587
--- /dev/null
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.h
@@ -0,0 +1,51 @@
+//===---- Mips16ISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16ISELDAGTODAG_H
+#define MIPS16ISELDAGTODAG_H
+
+#include "MipsISelDAGToDAG.h"
+
+namespace llvm {
+
+class Mips16DAGToDAGISel : public MipsDAGToDAGISel {
+public:
+  explicit Mips16DAGToDAGISel(MipsTargetMachine &TM) : MipsDAGToDAGISel(TM) {}
+
+private:
+  std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, DebugLoc DL,
+                                         EVT Ty, bool HasLo, bool HasHi);
+
+  SDValue getMips16SPAliasReg();
+
+  void getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg);
+
+  virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
+                            SDValue &Offset, SDValue &Alias);
+
+  virtual std::pair<bool, SDNode*> selectNode(SDNode *Node);
+
+  virtual void processFunctionAfterISel(MachineFunction &MF);
+
+  // Insert instructions to initialize the global base register in the
+  // first MBB of the function.
+  void initGlobalBaseReg(MachineFunction &MF);
+
+  void initMips16SPAliasReg(MachineFunction &MF);
+};
+
+FunctionPass *createMips16ISelDag(MipsTargetMachine &TM);
+
+}
+
+#endif
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
new file mode 100644
index 0000000..23eb537
--- /dev/null
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -0,0 +1,689 @@
+//===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "mips-lower"
+#include "Mips16ISelLowering.h"
+#include "MipsRegisterInfo.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <set>
+
+using namespace llvm;
+
+static cl::opt<bool>
+Mips16HardFloat("mips16-hard-float", cl::NotHidden,
+                cl::desc("MIPS: mips16 hard float enable."),
+                cl::init(false));
+
+static cl::opt<bool> DontExpandCondPseudos16(
+  "mips16-dont-expand-cond-pseudo",
+  cl::init(false),
+  cl::desc("Dont expand conditional move related "
+           "pseudos for Mips 16"),
+  cl::Hidden);
+
+namespace {
+  std::set<const char*, MipsTargetLowering::LTStr> NoHelperNeeded;
+}
+
+Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM)
+  : MipsTargetLowering(TM) {
+  //
+  // set up as if mips32 and then revert so we can test the mechanism
+  // for switching
+  addRegisterClass(MVT::i32, &Mips::CPURegsRegClass);
+  addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
+  computeRegisterProperties();
+  clearRegisterClasses();
+
+  // Set up the register classes
+  addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
+
+  if (Mips16HardFloat)
+    setMips16HardFloatLibCalls();
+
+  setOperationAction(ISD::MEMBARRIER,         MVT::Other, Expand);
+  setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Expand);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_SWAP,        MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_ADD,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_AND,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_OR,     MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_XOR,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_NAND,   MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_MIN,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_MAX,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_UMIN,   MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_UMAX,   MVT::i32,   Expand);
+
+  computeRegisterProperties();
+}
+
+const MipsTargetLowering *
+llvm::createMips16TargetLowering(MipsTargetMachine &TM) {
+  return new Mips16TargetLowering(TM);
+}
+
+bool
+Mips16TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+  return false;
+}
+
+MachineBasicBlock *
+Mips16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                  MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default:
+    return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case Mips::SelBeqZ:
+    return emitSel16(Mips::BeqzRxImm16, MI, BB);
+  case Mips::SelBneZ:
+    return emitSel16(Mips::BnezRxImm16, MI, BB);
+  case Mips::SelTBteqZCmpi:
+    return emitSeliT16(Mips::BteqzX16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::SelTBteqZSlti:
+    return emitSeliT16(Mips::BteqzX16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SelTBteqZSltiu:
+    return emitSeliT16(Mips::BteqzX16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SelTBtneZCmpi:
+    return emitSeliT16(Mips::BtnezX16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::SelTBtneZSlti:
+    return emitSeliT16(Mips::BtnezX16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SelTBtneZSltiu:
+    return emitSeliT16(Mips::BtnezX16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SelTBteqZCmp:
+    return emitSelT16(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
+  case Mips::SelTBteqZSlt:
+    return emitSelT16(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
+  case Mips::SelTBteqZSltu:
+    return emitSelT16(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
+  case Mips::SelTBtneZCmp:
+    return emitSelT16(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
+  case Mips::SelTBtneZSlt:
+    return emitSelT16(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
+  case Mips::SelTBtneZSltu:
+    return emitSelT16(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BteqzT8CmpX16:
+    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
+  case Mips::BteqzT8SltX16:
+    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
+  case Mips::BteqzT8SltuX16:
+    // TBD: figure out a way to get this or remove the instruction
+    // altogether.
+    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BtnezT8CmpX16:
+    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
+  case Mips::BtnezT8SltX16:
+    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
+  case Mips::BtnezT8SltuX16:
+    // TBD: figure out a way to get this or remove the instruction
+    // altogether.
+    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BteqzT8CmpiX16: return emitFEXT_T8I8I16_ins(
+    Mips::BteqzX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::BteqzT8SltiX16: return emitFEXT_T8I8I16_ins(
+    Mips::BteqzX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::BteqzT8SltiuX16: return emitFEXT_T8I8I16_ins(
+    Mips::BteqzX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::BtnezT8CmpiX16: return emitFEXT_T8I8I16_ins(
+    Mips::BtnezX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::BtnezT8SltiX16: return emitFEXT_T8I8I16_ins(
+    Mips::BtnezX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::BtnezT8SltiuX16: return emitFEXT_T8I8I16_ins(
+    Mips::BtnezX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+    break;
+  case Mips::SltCCRxRy16:
+    return emitFEXT_CCRX16_ins(Mips::SltRxRy16, MI, BB);
+    break;
+  case Mips::SltiCCRxImmX16:
+    return emitFEXT_CCRXI16_ins
+      (Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SltiuCCRxImmX16:
+    return emitFEXT_CCRXI16_ins
+      (Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SltuCCRxRy16:
+    return emitFEXT_CCRX16_ins
+      (Mips::SltuRxRy16, MI, BB);
+  }
+}
+
+bool Mips16TargetLowering::
+isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                  unsigned NextStackOffset,
+                                  const MipsFunctionInfo& FI) const {
+  // No tail call optimization for mips16.
+  return false;
+}
+
+void Mips16TargetLowering::setMips16LibcallName
+  (RTLIB::Libcall L, const char *Name) {
+  setLibcallName(L, Name);
+  NoHelperNeeded.insert(Name);
+}
+
+void Mips16TargetLowering::setMips16HardFloatLibCalls() {
+  setMips16LibcallName(RTLIB::ADD_F32, "__mips16_addsf3");
+  setMips16LibcallName(RTLIB::ADD_F64, "__mips16_adddf3");
+  setMips16LibcallName(RTLIB::SUB_F32, "__mips16_subsf3");
+  setMips16LibcallName(RTLIB::SUB_F64, "__mips16_subdf3");
+  setMips16LibcallName(RTLIB::MUL_F32, "__mips16_mulsf3");
+  setMips16LibcallName(RTLIB::MUL_F64, "__mips16_muldf3");
+  setMips16LibcallName(RTLIB::DIV_F32, "__mips16_divsf3");
+  setMips16LibcallName(RTLIB::DIV_F64, "__mips16_divdf3");
+  setMips16LibcallName(RTLIB::FPEXT_F32_F64, "__mips16_extendsfdf2");
+  setMips16LibcallName(RTLIB::FPROUND_F64_F32, "__mips16_truncdfsf2");
+  setMips16LibcallName(RTLIB::FPTOSINT_F32_I32, "__mips16_fix_truncsfsi");
+  setMips16LibcallName(RTLIB::FPTOSINT_F64_I32, "__mips16_fix_truncdfsi");
+  setMips16LibcallName(RTLIB::SINTTOFP_I32_F32, "__mips16_floatsisf");
+  setMips16LibcallName(RTLIB::SINTTOFP_I32_F64, "__mips16_floatsidf");
+  setMips16LibcallName(RTLIB::UINTTOFP_I32_F32, "__mips16_floatunsisf");
+  setMips16LibcallName(RTLIB::UINTTOFP_I32_F64, "__mips16_floatunsidf");
+  setMips16LibcallName(RTLIB::OEQ_F32, "__mips16_eqsf2");
+  setMips16LibcallName(RTLIB::OEQ_F64, "__mips16_eqdf2");
+  setMips16LibcallName(RTLIB::UNE_F32, "__mips16_nesf2");
+  setMips16LibcallName(RTLIB::UNE_F64, "__mips16_nedf2");
+  setMips16LibcallName(RTLIB::OGE_F32, "__mips16_gesf2");
+  setMips16LibcallName(RTLIB::OGE_F64, "__mips16_gedf2");
+  setMips16LibcallName(RTLIB::OLT_F32, "__mips16_ltsf2");
+  setMips16LibcallName(RTLIB::OLT_F64, "__mips16_ltdf2");
+  setMips16LibcallName(RTLIB::OLE_F32, "__mips16_lesf2");
+  setMips16LibcallName(RTLIB::OLE_F64, "__mips16_ledf2");
+  setMips16LibcallName(RTLIB::OGT_F32, "__mips16_gtsf2");
+  setMips16LibcallName(RTLIB::OGT_F64, "__mips16_gtdf2");
+  setMips16LibcallName(RTLIB::UO_F32, "__mips16_unordsf2");
+  setMips16LibcallName(RTLIB::UO_F64, "__mips16_unorddf2");
+  setMips16LibcallName(RTLIB::O_F32, "__mips16_unordsf2");
+  setMips16LibcallName(RTLIB::O_F64, "__mips16_unorddf2");
+}
+
+
+//
+// The Mips16 hard float is a crazy quilt inherited from gcc. I have a much
+// cleaner way to do all of this but it will have to wait until the traditional
+// gcc mechanism is completed.
+//
+// For Pic, in order for Mips16 code to call Mips32 code which according the abi
+// have either arguments or returned values placed in floating point registers,
+// we use a set of helper functions. (This includes functions which return type
+//  complex which on Mips are returned in a pair of floating point registers).
+//
+// This is an encoding that we inherited from gcc.
+// In Mips traditional O32, N32 ABI, floating point numbers are passed in
+// floating point argument registers 1,2 only when the first and optionally
+// the second arguments are float (sf) or double (df).
+// For Mips16 we are only concerned with the situations where floating point
+// arguments are being passed in floating point registers by the ABI, because
+// Mips16 mode code cannot execute floating point instructions to load those
+// values and hence helper functions are needed.
+// The possibilities are (), (sf), (sf, sf), (sf, df), (df), (df, sf), (df, df)
+// the helper function suffixs for these are:
+//                        0,  1,    5,        9,         2,   6,        10
+// this suffix can then be calculated as follows:
+// for a given argument Arg:
+//     Arg1x, Arg2x = 1 :  Arg is sf
+//                    2 :  Arg is df
+//                    0:   Arg is neither sf or df
+// So this stub is the string for number Arg1x + Arg2x*4.
+// However not all numbers between 0 and 10 are possible, we check anyway and
+// assert if the impossible exists.
+//
+
+unsigned int Mips16TargetLowering::getMips16HelperFunctionStubNumber
+  (ArgListTy &Args) const {
+  unsigned int resultNum = 0;
+  if (Args.size() >= 1) {
+    Type *t = Args[0].Ty;
+    if (t->isFloatTy()) {
+      resultNum = 1;
+    }
+    else if (t->isDoubleTy()) {
+      resultNum = 2;
+    }
+  }
+  if (resultNum) {
+    if (Args.size() >=2) {
+      Type *t = Args[1].Ty;
+      if (t->isFloatTy()) {
+        resultNum += 4;
+      }
+      else if (t->isDoubleTy()) {
+        resultNum += 8;
+      }
+    }
+  }
+  return resultNum;
+}
+
+//
+// prefixs are attached to stub numbers depending on the return type .
+// return type: float  sf_
+//              double df_
+//              single complex sc_
+//              double complext dc_
+//              others  NO PREFIX
+//
+//
+// The full name of a helper function is__mips16_call_stub +
+//    return type dependent prefix + stub number
+//
+//
+// This is something that probably should be in a different source file and
+// perhaps done differently but my main purpose is to not waste runtime
+// on something that we can enumerate in the source. Another possibility is
+// to have a python script to generate these mapping tables. This will do
+// for now. There are a whole series of helper function mapping arrays, one
+// for each return type class as outlined above. There there are 11 possible
+//  entries. Ones with 0 are ones which should never be selected
+//
+// All the arrays are similar except for ones which return neither
+// sf, df, sc, dc, in which only care about ones which have sf or df as a
+// first parameter.
+//
+#define P_ "__mips16_call_stub_"
+#define MAX_STUB_NUMBER 10
+#define T1 P "1", P "2", 0, 0, P "5", P "6", 0, 0, P "9", P "10"
+#define T P "0" , T1
+#define P P_
+static char const * vMips16Helper[MAX_STUB_NUMBER+1] =
+  {0, T1 };
+#undef P
+#define P P_ "sf_"
+static char const * sfMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "df_"
+static char const * dfMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "sc_"
+static char const * scMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "dc_"
+static char const * dcMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#undef P_
+
+
+const char* Mips16TargetLowering::
+  getMips16HelperFunction
+    (Type* RetTy, ArgListTy &Args, bool &needHelper) const {
+  const unsigned int stubNum = getMips16HelperFunctionStubNumber(Args);
+#ifndef NDEBUG
+  const unsigned int maxStubNum = 10;
+  assert(stubNum <= maxStubNum);
+  const bool validStubNum[maxStubNum+1] =
+    {true, true, true, false, false, true, true, false, false, true, true};
+  assert(validStubNum[stubNum]);
+#endif
+  const char *result;
+  if (RetTy->isFloatTy()) {
+    result = sfMips16Helper[stubNum];
+  }
+  else if (RetTy ->isDoubleTy()) {
+    result = dfMips16Helper[stubNum];
+  }
+  else if (RetTy->isStructTy()) {
+    // check if it's complex
+    if (RetTy->getNumContainedTypes() == 2) {
+      if ((RetTy->getContainedType(0)->isFloatTy()) &&
+          (RetTy->getContainedType(1)->isFloatTy())) {
+        result = scMips16Helper[stubNum];
+      }
+      else if ((RetTy->getContainedType(0)->isDoubleTy()) &&
+               (RetTy->getContainedType(1)->isDoubleTy())) {
+        result = dcMips16Helper[stubNum];
+      }
+      else {
+        llvm_unreachable("Uncovered condition");
+      }
+    }
+    else {
+      llvm_unreachable("Uncovered condition");
+    }
+  }
+  else {
+    if (stubNum == 0) {
+      needHelper = false;
+      return "";
+    }
+    result = vMips16Helper[stubNum];
+  }
+  needHelper = true;
+  return result;
+}
+
+void Mips16TargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+  SelectionDAG &DAG = CLI.DAG;
+  const char* Mips16HelperFunction = 0;
+  bool NeedMips16Helper = false;
+
+  if (getTargetMachine().Options.UseSoftFloat && Mips16HardFloat) {
+    //
+    // currently we don't have symbols tagged with the mips16 or mips32
+    // qualifier so we will assume that we don't know what kind it is.
+    // and generate the helper
+    //
+    bool LookupHelper = true;
+    if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) {
+      if (NoHelperNeeded.find(S->getSymbol()) != NoHelperNeeded.end()) {
+        LookupHelper = false;
+      }
+    }
+    if (LookupHelper) Mips16HelperFunction =
+      getMips16HelperFunction(CLI.RetTy, CLI.Args, NeedMips16Helper);
+
+  }
+
+  SDValue JumpTarget = Callee;
+
+  // T9 should contain the address of the callee function if
+  // -reloction-model=pic or it is an indirect call.
+  if (IsPICCall || !GlobalOrExternal) {
+    unsigned V0Reg = Mips::V0;
+    if (NeedMips16Helper) {
+      RegsToPass.push_front(std::make_pair(V0Reg, Callee));
+      JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, getPointerTy());
+      JumpTarget = getAddrGlobal(JumpTarget, DAG, MipsII::MO_GOT);
+    } else
+      RegsToPass.push_front(std::make_pair((unsigned)Mips::T9, Callee));
+  }
+
+  Ops.push_back(JumpTarget);
+
+  MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
+                                  InternalLinkage, CLI, Callee, Chain);
+}
+
+MachineBasicBlock *Mips16TargetLowering::
+emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc)).addReg(MI->getOperand(3).getReg())
+    .addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL,
+          TII->get(Mips::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitSelT16
+  (unsigned Opc1, unsigned Opc2,
+   MachineInstr *MI, MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc2)).addReg(MI->getOperand(3).getReg())
+    .addReg(MI->getOperand(4).getReg());
+  BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL,
+          TII->get(Mips::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitSeliT16
+  (unsigned Opc1, unsigned Opc2,
+   MachineInstr *MI, MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc2)).addReg(MI->getOperand(3).getReg())
+    .addImm(MI->getOperand(4).getImm());
+  BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL,
+          TII->get(Mips::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+
+}
+
+MachineBasicBlock
+  *Mips16TargetLowering::emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
+                                             MachineInstr *MI,
+                                             MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned regX = MI->getOperand(0).getReg();
+  unsigned regY = MI->getOperand(1).getReg();
+  MachineBasicBlock *target = MI->getOperand(2).getMBB();
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX).addReg(regY);
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
+  unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc,
+  MachineInstr *MI,  MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned regX = MI->getOperand(0).getReg();
+  int64_t imm = MI->getOperand(1).getImm();
+  MachineBasicBlock *target = MI->getOperand(2).getMBB();
+  unsigned CmpOpc;
+  if (isUInt<8>(imm))
+    CmpOpc = CmpiOpc;
+  else if (isUInt<16>(imm))
+    CmpOpc = CmpiXOpc;
+  else
+    llvm_unreachable("immediate field not usable");
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX).addImm(imm);
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+static unsigned Mips16WhichOp8uOr16simm
+  (unsigned shortOp, unsigned longOp, int64_t Imm) {
+  if (isUInt<8>(Imm))
+    return shortOp;
+  else if (isInt<16>(Imm))
+    return longOp;
+  else
+    llvm_unreachable("immediate field not usable");
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitFEXT_CCRX16_ins(
+  unsigned SltOpc,
+  MachineInstr *MI,  MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned CC = MI->getOperand(0).getReg();
+  unsigned regX = MI->getOperand(1).getReg();
+  unsigned regY = MI->getOperand(2).getReg();
+  BuildMI(*BB, MI, MI->getDebugLoc(),
+		  TII->get(SltOpc)).addReg(regX).addReg(regY);
+  BuildMI(*BB, MI, MI->getDebugLoc(),
+          TII->get(Mips::MoveR3216), CC).addReg(Mips::T8);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitFEXT_CCRXI16_ins(
+  unsigned SltiOpc, unsigned SltiXOpc,
+  MachineInstr *MI,  MachineBasicBlock *BB )const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned CC = MI->getOperand(0).getReg();
+  unsigned regX = MI->getOperand(1).getReg();
+  int64_t Imm = MI->getOperand(2).getImm();
+  unsigned SltOpc = Mips16WhichOp8uOr16simm(SltiOpc, SltiXOpc, Imm);
+  BuildMI(*BB, MI, MI->getDebugLoc(),
+          TII->get(SltOpc)).addReg(regX).addImm(Imm);
+  BuildMI(*BB, MI, MI->getDebugLoc(),
+          TII->get(Mips::MoveR3216), CC).addReg(Mips::T8);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+
+}
diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h
new file mode 100644
index 0000000..b23e2a1
--- /dev/null
+++ b/lib/Target/Mips/Mips16ISelLowering.h
@@ -0,0 +1,80 @@
+//===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef Mips16ISELLOWERING_H
+#define Mips16ISELLOWERING_H
+
+#include "MipsISelLowering.h"
+
+namespace llvm {
+  class Mips16TargetLowering : public MipsTargetLowering  {
+  public:
+    explicit Mips16TargetLowering(MipsTargetMachine &TM);
+
+    virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+
+    virtual MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+
+  private:
+    virtual bool
+    isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                      unsigned NextStackOffset,
+                                      const MipsFunctionInfo& FI) const;
+
+    void setMips16LibcallName(RTLIB::Libcall, const char *Name);
+
+    void setMips16HardFloatLibCalls();
+
+    unsigned int
+      getMips16HelperFunctionStubNumber(ArgListTy &Args) const;
+
+    const char *getMips16HelperFunction
+      (Type* RetTy, ArgListTy &Args, bool &needHelper) const;
+
+    virtual void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
+
+    MachineBasicBlock *emitSel16(unsigned Opc, MachineInstr *MI,
+                                 MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitSeliT16(unsigned Opc1, unsigned Opc2,
+                                   MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitSelT16(unsigned Opc1, unsigned Opc2,
+                                  MachineInstr *MI,
+                                  MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
+                                           MachineInstr *MI,
+                                           MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_T8I8I16_ins(
+      unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc,
+      MachineInstr *MI,  MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_CCRX16_ins(
+      unsigned SltOpc,
+      MachineInstr *MI,  MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_CCRXI16_ins(
+      unsigned SltiOpc, unsigned SltiXOpc,
+      MachineInstr *MI,  MachineBasicBlock *BB )const;
+  };
+}
+
+#endif // Mips16ISELLOWERING_H
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index a9e9c52..6293829 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -15,7 +15,7 @@
 // Mips Address
 //
 def addr16 :
-  ComplexPattern<iPTR, 3, "SelectAddr16", [frameindex], [SDNPWantParent]>;
+  ComplexPattern<iPTR, 3, "selectAddr16", [frameindex], [SDNPWantParent]>;
 
 //
 // Address operand
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 494ba87..5903b9e 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -313,7 +313,7 @@ def : InstAlias<"move $dst, $src",
                 (DADDu CPU64RegsOpnd:$dst,  CPU64RegsOpnd:$src, ZERO_64), 1>,
       Requires<[HasMips64]>;
 def : InstAlias<"move $dst, $src",
-                (OR64 CPU64RegsOpnd:$dst, CPU64RegsOpnd:$src, ZERO_64), 0>,
+                (OR64 CPU64RegsOpnd:$dst, CPU64RegsOpnd:$src, ZERO_64), 1>,
       Requires<[HasMips64]>;
 def : InstAlias<"and $rs, $rt, $imm",
                 (DANDi CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, uimm16_64:$imm),
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 78cf140..462def7 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -96,6 +96,12 @@ def RetCC_MipsN : CallingConv<[
   CCIfType<[f64], CCAssignToReg<[D0_64, D2_64]>>
 ]>;
 
+// In soft-mode, register A0_64, instead of V1_64, is used to return a long
+// double value.
+def RetCC_F128Soft : CallingConv<[
+  CCIfType<[i64], CCAssignToReg<[V0_64, A0_64]>>
+]>;
+
 //===----------------------------------------------------------------------===//
 // Mips EABI Calling Convention
 //===----------------------------------------------------------------------===//
@@ -139,17 +145,6 @@ def RetCC_MipsEABI : CallingConv<[
 ]>;
 
 //===----------------------------------------------------------------------===//
-// Mips Android Calling Convention
-//===----------------------------------------------------------------------===//
-
-def RetCC_MipsAndroid : CallingConv<[
-  // f32 are returned in registers F0, F2, F1, F3
-  CCIfType<[f32], CCAssignToReg<[F0, F2, F1, F3]>>,
-
-  CCDelegateTo<RetCC_MipsO32>
-]>;
-
-//===----------------------------------------------------------------------===//
 // Mips FastCC Calling Convention
 //===----------------------------------------------------------------------===//
 def CC_MipsO32_FastCC : CallingConv<[
@@ -209,7 +204,6 @@ def RetCC_Mips : CallingConv<[
   CCIfSubtarget<"isABI_EABI()", CCDelegateTo<RetCC_MipsEABI>>,
   CCIfSubtarget<"isABI_N32()", CCDelegateTo<RetCC_MipsN>>,
   CCIfSubtarget<"isABI_N64()", CCDelegateTo<RetCC_MipsN>>,
-  CCIfSubtarget<"isAndroid()", CCDelegateTo<RetCC_MipsAndroid>>,
   CCDelegateTo<RetCC_MipsO32>
 ]>;
 
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index 559370b..42e4c99 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -58,21 +58,23 @@ multiclass MovzPats0<RegisterClass CRC, RegisterClass DRC,
                      Instruction SLTiuOp> {
   def : MipsPat<(select (i32 (setge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
                 (MOVZInst DRC:$T, (SLTOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
-  def : MipsPat<
-          (select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
-          (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
-  def : MipsPat<
-          (select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F),
-          (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>;
-  def : MipsPat<
-          (select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F),
-          (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>;
-  def : MipsPat<
-          (select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
-          (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
-  def : MipsPat<
-          (select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
-          (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>;
+  def : MipsPat<(select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setgt CRC:$lhs, immSExt16Plus1:$rhs)),
+                        DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, (Plus1 imm:$rhs)), DRC:$F)>;
+  def : MipsPat<(select (i32 (setugt CRC:$lhs, immSExt16Plus1:$rhs)),
+                        DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiuOp CRC:$lhs, (Plus1 imm:$rhs)),
+                          DRC:$F)>;
 }
 
 multiclass MovzPats1<RegisterClass CRC, RegisterClass DRC,
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
new file mode 100644
index 0000000..b5de1eb
--- /dev/null
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -0,0 +1,85 @@
+//===-- MipsConstantIslandPass.cpp - Emit Pc Relative loads----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// This pass is used to make Pc relative loads of constants.
+// For now, only Mips16 will use this. While it has the same name and
+// uses many ideas from the LLVM ARM Constant Island Pass, it's not intended
+// to reuse any of the code from the ARM version.
+//
+// Loading constants inline is expensive on Mips16 and it's in general better
+// to place the constant nearby in code space and then it can be loaded with a
+// simple 16 bit load instruction.
+//
+// The constants can be not just numbers but addresses of functions and labels.
+// This can be particularly helpful in static relocation mode for embedded
+// non linux targets.
+//
+//
+
+#define DEBUG_TYPE "mips-constant-islands"
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+  class MipsConstantIslands : public MachineFunctionPass {
+
+  public:
+    static char ID;
+    MipsConstantIslands(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm),
+        TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())),
+        IsPIC(TM.getRelocationModel() == Reloc::PIC_),
+        ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()) {}
+
+    virtual const char *getPassName() const {
+      return "Mips Constant Islands";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F);
+
+  private:
+
+
+    const TargetMachine &TM;
+    const MipsInstrInfo *TII;
+    bool IsPIC;
+    unsigned ABI;
+
+  };
+
+  char MipsConstantIslands::ID = 0;
+} // end of anonymous namespace
+
+/// createMipsLongBranchPass - Returns a pass that converts branches to long
+/// branches.
+FunctionPass *llvm::createMipsConstantIslandPass(MipsTargetMachine &tm) {
+  return new MipsConstantIslands(tm);
+}
+
+bool MipsConstantIslands::runOnMachineFunction(MachineFunction &F) {
+  return true;
+}
+
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index d62b166..e265590 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -14,11 +14,17 @@
 #define DEBUG_TYPE "delay-slot-filler"
 
 #include "Mips.h"
+#include "MipsInstrInfo.h"
 #include "MipsTargetMachine.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
@@ -36,18 +42,59 @@ static cl::opt<bool> DisableDelaySlotFiller(
   cl::desc("Fill all delay slots with NOPs."),
   cl::Hidden);
 
-// This option can be used to silence complaints by machine verifier passes.
-static cl::opt<bool> SkipDelaySlotFiller(
-  "skip-mips-delay-filler",
+static cl::opt<bool> DisableForwardSearch(
+  "disable-mips-df-forward-search",
+  cl::init(true),
+  cl::desc("Disallow MIPS delay filler to search forward."),
+  cl::Hidden);
+
+static cl::opt<bool> DisableSuccBBSearch(
+  "disable-mips-df-succbb-search",
+  cl::init(true),
+  cl::desc("Disallow MIPS delay filler to search successor basic blocks."),
+  cl::Hidden);
+
+static cl::opt<bool> DisableBackwardSearch(
+  "disable-mips-df-backward-search",
   cl::init(false),
-  cl::desc("Skip MIPS' delay slot filling pass."),
+  cl::desc("Disallow MIPS delay filler to search backward."),
   cl::Hidden);
 
 namespace {
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+  typedef SmallDenseMap<MachineBasicBlock*, MachineInstr*, 2> BB2BrMap;
+
+  /// \brief A functor comparing edge weight of two blocks.
+  struct CmpWeight {
+    CmpWeight(const MachineBasicBlock &S,
+              const MachineBranchProbabilityInfo &P) : Src(S), Prob(P) {}
+
+    bool operator()(const MachineBasicBlock *Dst0,
+                    const MachineBasicBlock *Dst1) const {
+      return Prob.getEdgeWeight(&Src, Dst0) < Prob.getEdgeWeight(&Src, Dst1);
+    }
+
+    const MachineBasicBlock &Src;
+    const MachineBranchProbabilityInfo &Prob;
+  };
+
   class RegDefsUses {
   public:
     RegDefsUses(TargetMachine &TM);
     void init(const MachineInstr &MI);
+
+    /// This function sets all caller-saved registers in Defs.
+    void setCallerSaved(const MachineInstr &MI);
+
+    /// This function sets all unallocatable registers in Defs.
+    void setUnallocatableRegs(const MachineFunction &MF);
+
+    /// Set bits in Uses corresponding to MBB's live-out registers except for
+    /// the registers that are live-in to SuccBB.
+    void addLiveOut(const MachineBasicBlock &MBB,
+                    const MachineBasicBlock &SuccBB);
+
     bool update(const MachineInstr &MI, unsigned Begin, unsigned End);
 
   private:
@@ -61,6 +108,72 @@ namespace {
     BitVector Defs, Uses;
   };
 
+  /// Base class for inspecting loads and stores.
+  class InspectMemInstr {
+  public:
+    InspectMemInstr(bool ForbidMemInstr_)
+      : OrigSeenLoad(false), OrigSeenStore(false), SeenLoad(false),
+        SeenStore(false), ForbidMemInstr(ForbidMemInstr_) {}
+
+    /// Return true if MI cannot be moved to delay slot.
+    bool hasHazard(const MachineInstr &MI);
+
+    virtual ~InspectMemInstr() {}
+
+  protected:
+    /// Flags indicating whether loads or stores have been seen.
+    bool OrigSeenLoad, OrigSeenStore, SeenLoad, SeenStore;
+
+    /// Memory instructions are not allowed to move to delay slot if this flag
+    /// is true.
+    bool ForbidMemInstr;
+
+  private:
+    virtual bool hasHazard_(const MachineInstr &MI) = 0;
+  };
+
+  /// This subclass rejects any memory instructions.
+  class NoMemInstr : public InspectMemInstr {
+  public:
+    NoMemInstr() : InspectMemInstr(true) {}
+  private:
+    virtual bool hasHazard_(const MachineInstr &MI) { return true; }
+  };
+
+  /// This subclass accepts loads from stacks and constant loads.
+  class LoadFromStackOrConst : public InspectMemInstr {
+  public:
+    LoadFromStackOrConst() : InspectMemInstr(false) {}
+  private:
+    virtual bool hasHazard_(const MachineInstr &MI);
+  };
+
+  /// This subclass uses memory dependence information to determine whether a
+  /// memory instruction can be moved to a delay slot.
+  class MemDefsUses : public InspectMemInstr {
+  public:
+    MemDefsUses(const MachineFrameInfo *MFI);
+
+  private:
+    virtual bool hasHazard_(const MachineInstr &MI);
+
+    /// Update Defs and Uses. Return true if there exist dependences that
+    /// disqualify the delay slot candidate between V and values in Uses and
+    /// Defs.
+    bool updateDefsUses(const Value *V, bool MayStore);
+
+    /// Get the list of underlying objects of MI's memory operand.
+    bool getUnderlyingObjects(const MachineInstr &MI,
+                              SmallVectorImpl<const Value *> &Objects) const;
+
+    const MachineFrameInfo *MFI;
+    SmallPtrSet<const Value*, 4> Uses, Defs;
+
+    /// Flags indicating whether loads or stores with no underlying objects have
+    /// been seen.
+    bool SeenNoObjLoad, SeenNoObjStore;
+  };
+
   class Filler : public MachineFunctionPass {
   public:
     Filler(TargetMachine &tm)
@@ -71,9 +184,6 @@ namespace {
     }
 
     bool runOnMachineFunction(MachineFunction &F) {
-      if (SkipDelaySlotFiller)
-        return false;
-
       bool Changed = false;
       for (MachineFunction::iterator FI = F.begin(), FE = F.end();
            FI != FE; ++FI)
@@ -81,19 +191,54 @@ namespace {
       return Changed;
     }
 
-  private:
-    typedef MachineBasicBlock::iterator Iter;
-    typedef MachineBasicBlock::reverse_iterator ReverseIter;
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
 
+  private:
     bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
 
     /// This function checks if it is valid to move Candidate to the delay slot
-    /// and returns true if it isn't. It also updates load and store flags and
-    /// register defs and uses.
-    bool delayHasHazard(const MachineInstr &Candidate, bool &SawLoad,
-                        bool &SawStore, RegDefsUses &RegDU) const;
-
-    bool findDelayInstr(MachineBasicBlock &MBB, Iter slot, Iter &Filler) const;
+    /// and returns true if it isn't. It also updates memory and register
+    /// dependence information.
+    bool delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
+                        InspectMemInstr &IM) const;
+
+    /// This function searches range [Begin, End) for an instruction that can be
+    /// moved to the delay slot. Returns true on success.
+    template<typename IterTy>
+    bool searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
+                     RegDefsUses &RegDU, InspectMemInstr &IM,
+                     IterTy &Filler) const;
+
+    /// This function searches in the backward direction for an instruction that
+    /// can be moved to the delay slot. Returns true on success.
+    bool searchBackward(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// This function searches MBB in the forward direction for an instruction
+    /// that can be moved to the delay slot. Returns true on success.
+    bool searchForward(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// This function searches MBB's successor blocks for an instruction that
+    /// can be moved to the delay slot and inserts clones of the instruction
+    /// into the successor blocks.
+    bool searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// Pick a successor block of MBB. Return NULL if MBB doesn't have a
+    /// successor block that is not a landing pad.
+    MachineBasicBlock *selectSuccBB(MachineBasicBlock &B) const;
+
+    /// This function analyzes MBB and returns an instruction with an unoccupied
+    /// slot that branches to Dst.
+    std::pair<MipsInstrInfo::BranchType, MachineInstr *>
+    getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const;
+
+    /// Examine Pred and see if it is possible to insert an instruction into
+    /// one of its branches delay slot or its end.
+    bool examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
+                     RegDefsUses &RegDU, bool &HasMultipleSuccs,
+                     BB2BrMap &BrMap) const;
 
     bool terminateSearch(const MachineInstr &Candidate) const;
 
@@ -105,6 +250,45 @@ namespace {
   char Filler::ID = 0;
 } // end of anonymous namespace
 
+static bool hasUnoccupiedSlot(const MachineInstr *MI) {
+  return MI->hasDelaySlot() && !MI->isBundledWithSucc();
+}
+
+/// This function inserts clones of Filler into predecessor blocks.
+static void insertDelayFiller(Iter Filler, const BB2BrMap &BrMap) {
+  MachineFunction *MF = Filler->getParent()->getParent();
+
+  for (BB2BrMap::const_iterator I = BrMap.begin(); I != BrMap.end(); ++I) {
+    if (I->second) {
+      MIBundleBuilder(I->second).append(MF->CloneMachineInstr(&*Filler));
+      ++UsefulSlots;
+    } else {
+      I->first->insert(I->first->end(), MF->CloneMachineInstr(&*Filler));
+    }
+  }
+}
+
+/// This function adds registers Filler defines to MBB's live-in register list.
+static void addLiveInRegs(Iter Filler, MachineBasicBlock &MBB) {
+  for (unsigned I = 0, E = Filler->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = Filler->getOperand(I);
+    unsigned R;
+
+    if (!MO.isReg() || !MO.isDef() || !(R = MO.getReg()))
+      continue;
+
+#ifndef NDEBUG
+    const MachineFunction &MF = *MBB.getParent();
+    assert(MF.getTarget().getRegisterInfo()->getAllocatableSet(MF).test(R) &&
+           "Shouldn't move an instruction with unallocatable registers across "
+           "basic block boundaries.");
+#endif
+
+    if (!MBB.isLiveIn(R))
+      MBB.addLiveIn(R);
+  }
+}
+
 RegDefsUses::RegDefsUses(TargetMachine &TM)
   : TRI(*TM.getRegisterInfo()), Defs(TRI.getNumRegs(), false),
     Uses(TRI.getNumRegs(), false) {}
@@ -126,6 +310,45 @@ void RegDefsUses::init(const MachineInstr &MI) {
   }
 }
 
+void RegDefsUses::setCallerSaved(const MachineInstr &MI) {
+  assert(MI.isCall());
+
+  // If MI is a call, add all caller-saved registers to Defs.
+  BitVector CallerSavedRegs(TRI.getNumRegs(), true);
+
+  CallerSavedRegs.reset(Mips::ZERO);
+  CallerSavedRegs.reset(Mips::ZERO_64);
+
+  for (const MCPhysReg *R = TRI.getCalleeSavedRegs(); *R; ++R)
+    for (MCRegAliasIterator AI(*R, &TRI, true); AI.isValid(); ++AI)
+      CallerSavedRegs.reset(*AI);
+
+  Defs |= CallerSavedRegs;
+}
+
+void RegDefsUses::setUnallocatableRegs(const MachineFunction &MF) {
+  BitVector AllocSet = TRI.getAllocatableSet(MF);
+
+  for (int R = AllocSet.find_first(); R != -1; R = AllocSet.find_next(R))
+    for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI)
+      AllocSet.set(*AI);
+
+  AllocSet.set(Mips::ZERO);
+  AllocSet.set(Mips::ZERO_64);
+
+  Defs |= AllocSet.flip();
+}
+
+void RegDefsUses::addLiveOut(const MachineBasicBlock &MBB,
+                             const MachineBasicBlock &SuccBB) {
+  for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
+       SE = MBB.succ_end(); SI != SE; ++SI)
+    if (*SI != &SuccBB)
+      for (MachineBasicBlock::livein_iterator LI = (*SI)->livein_begin(),
+           LE = (*SI)->livein_end(); LI != LE; ++LI)
+        Uses.set(*LI);
+}
+
 bool RegDefsUses::update(const MachineInstr &MI, unsigned Begin, unsigned End) {
   BitVector NewDefs(TRI.getNumRegs()), NewUses(TRI.getNumRegs());
   bool HasHazard = false;
@@ -164,28 +387,134 @@ bool RegDefsUses::isRegInSet(const BitVector &RegSet, unsigned Reg) const {
   return false;
 }
 
+bool InspectMemInstr::hasHazard(const MachineInstr &MI) {
+  if (!MI.mayStore() && !MI.mayLoad())
+    return false;
+
+  if (ForbidMemInstr)
+    return true;
+
+  OrigSeenLoad = SeenLoad;
+  OrigSeenStore = SeenStore;
+  SeenLoad |= MI.mayLoad();
+  SeenStore |= MI.mayStore();
+
+  // If MI is an ordered or volatile memory reference, disallow moving
+  // subsequent loads and stores to delay slot.
+  if (MI.hasOrderedMemoryRef() && (OrigSeenLoad || OrigSeenStore)) {
+    ForbidMemInstr = true;
+    return true;
+  }
+
+  return hasHazard_(MI);
+}
+
+bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
+  if (MI.mayStore())
+    return true;
+
+  if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getValue())
+    return true;
+
+  const Value *V = (*MI.memoperands_begin())->getValue();
+
+  if (isa<FixedStackPseudoSourceValue>(V))
+    return false;
+
+  if (const PseudoSourceValue *PSV = dyn_cast<const PseudoSourceValue>(V))
+    return !PSV->PseudoSourceValue::isConstant(0) &&
+      (V != PseudoSourceValue::getStack());
+
+  return true;
+}
+
+MemDefsUses::MemDefsUses(const MachineFrameInfo *MFI_)
+  : InspectMemInstr(false), MFI(MFI_), SeenNoObjLoad(false),
+    SeenNoObjStore(false) {}
+
+bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
+  bool HasHazard = false;
+  SmallVector<const Value *, 4> Objs;
+
+  // Check underlying object list.
+  if (getUnderlyingObjects(MI, Objs)) {
+    for (SmallVector<const Value *, 4>::const_iterator I = Objs.begin();
+         I != Objs.end(); ++I)
+      HasHazard |= updateDefsUses(*I, MI.mayStore());
+
+    return HasHazard;
+  }
+
+  // No underlying objects found.
+  HasHazard = MI.mayStore() && (OrigSeenLoad || OrigSeenStore);
+  HasHazard |= MI.mayLoad() || OrigSeenStore;
+
+  SeenNoObjLoad |= MI.mayLoad();
+  SeenNoObjStore |= MI.mayStore();
+
+  return HasHazard;
+}
+
+bool MemDefsUses::updateDefsUses(const Value *V, bool MayStore) {
+  if (MayStore)
+    return !Defs.insert(V) || Uses.count(V) || SeenNoObjStore || SeenNoObjLoad;
+
+  Uses.insert(V);
+  return Defs.count(V) || SeenNoObjStore;
+}
+
+bool MemDefsUses::
+getUnderlyingObjects(const MachineInstr &MI,
+                     SmallVectorImpl<const Value *> &Objects) const {
+  if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getValue())
+    return false;
+
+  const Value *V = (*MI.memoperands_begin())->getValue();
+
+  SmallVector<Value *, 4> Objs;
+  GetUnderlyingObjects(const_cast<Value *>(V), Objs);
+
+  for (SmallVector<Value*, 4>::iterator I = Objs.begin(), E = Objs.end();
+       I != E; ++I) {
+    if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(*I)) {
+      if (PSV->isAliased(MFI))
+        return false;
+    } else if (!isIdentifiedObject(V))
+      return false;
+
+    Objects.push_back(*I);
+  }
+
+  return true;
+}
+
 /// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
 /// We assume there is only one delay slot per delayed instruction.
 bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
   bool Changed = false;
 
   for (Iter I = MBB.begin(); I != MBB.end(); ++I) {
-    if (!I->hasDelaySlot())
+    if (!hasUnoccupiedSlot(&*I))
       continue;
 
     ++FilledSlots;
     Changed = true;
-    Iter D;
 
     // Delay slot filling is disabled at -O0.
-    if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None) &&
-        findDelayInstr(MBB, I, D)) {
-      MBB.splice(llvm::next(I), &MBB, D);
-      ++UsefulSlots;
-    } else
-      BuildMI(MBB, llvm::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
+    if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
+      if (searchBackward(MBB, I))
+        continue;
+
+      if (I->isTerminator()) {
+        if (searchSuccBBs(MBB, I))
+          continue;
+      } else if (searchForward(MBB, I)) {
+        continue;
+      }
+    }
 
-    // Bundle the delay slot filler to the instruction with the delay slot.
+    // Bundle the NOP to the instruction with the delay slot.
+    BuildMI(MBB, llvm::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
     MIBundleBuilder(MBB, I, llvm::next(llvm::next(I)));
   }
 
@@ -198,16 +527,11 @@ FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
   return new Filler(tm);
 }
 
-bool Filler::findDelayInstr(MachineBasicBlock &MBB, Iter Slot,
-                            Iter &Filler) const {
-  RegDefsUses RegDU(TM);
-
-  RegDU.init(*Slot);
-
-  bool SawLoad = false;
-  bool SawStore = false;
-
-  for (ReverseIter I(Slot); I != MBB.rend(); ++I) {
+template<typename IterTy>
+bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
+                         RegDefsUses &RegDU, InspectMemInstr& IM,
+                         IterTy &Filler) const {
+  for (IterTy I = Begin; I != End; ++I) {
     // skip debug value
     if (I->isDebugValue())
       continue;
@@ -215,33 +539,176 @@ bool Filler::findDelayInstr(MachineBasicBlock &MBB, Iter Slot,
     if (terminateSearch(*I))
       break;
 
-    if (delayHasHazard(*I, SawLoad, SawStore, RegDU))
+    assert((!I->isCall() && !I->isReturn() && !I->isBranch()) &&
+           "Cannot put calls, returns or branches in delay slot.");
+
+    if (delayHasHazard(*I, RegDU, IM))
       continue;
 
-    Filler = llvm::next(I).base();
+    Filler = I;
     return true;
   }
 
   return false;
 }
 
-bool Filler::delayHasHazard(const MachineInstr &Candidate, bool &SawLoad,
-                            bool &SawStore, RegDefsUses &RegDU) const {
-  bool HasHazard = (Candidate.isImplicitDef() || Candidate.isKill());
+bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const {
+  if (DisableBackwardSearch)
+    return false;
+
+  RegDefsUses RegDU(TM);
+  MemDefsUses MemDU(MBB.getParent()->getFrameInfo());
+  ReverseIter Filler;
+
+  RegDU.init(*Slot);
 
-  // Loads or stores cannot be moved past a store to the delay slot
-  // and stores cannot be moved past a load.
-  if (Candidate.mayStore() || Candidate.hasOrderedMemoryRef()) {
-    HasHazard |= SawStore | SawLoad;
-    SawStore = true;
-  } else if (Candidate.mayLoad()) {
-    HasHazard |= SawStore;
-    SawLoad = true;
+  if (searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler)) {
+    MBB.splice(llvm::next(Slot), &MBB, llvm::next(Filler).base());
+    MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+    ++UsefulSlots;
+    return true;
   }
 
-  assert((!Candidate.isCall() && !Candidate.isReturn()) &&
-         "Cannot put calls or returns in delay slot.");
+  return false;
+}
+
+bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
+  // Can handle only calls.
+  if (DisableForwardSearch || !Slot->isCall())
+    return false;
+
+  RegDefsUses RegDU(TM);
+  NoMemInstr NM;
+  Iter Filler;
+
+  RegDU.setCallerSaved(*Slot);
+
+  if (searchRange(MBB, llvm::next(Slot), MBB.end(), RegDU, NM, Filler)) {
+    MBB.splice(llvm::next(Slot), &MBB, Filler);
+    MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+    ++UsefulSlots;
+    return true;
+  }
+
+  return false;
+}
+
+bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
+  if (DisableSuccBBSearch)
+    return false;
+
+  MachineBasicBlock *SuccBB = selectSuccBB(MBB);
+
+  if (!SuccBB)
+    return false;
+
+  RegDefsUses RegDU(TM);
+  bool HasMultipleSuccs = false;
+  BB2BrMap BrMap;
+  OwningPtr<InspectMemInstr> IM;
+  Iter Filler;
+
+  // Iterate over SuccBB's predecessor list.
+  for (MachineBasicBlock::pred_iterator PI = SuccBB->pred_begin(),
+       PE = SuccBB->pred_end(); PI != PE; ++PI)
+    if (!examinePred(**PI, *SuccBB, RegDU, HasMultipleSuccs, BrMap))
+      return false;
+
+  // Do not allow moving instructions which have unallocatable register operands
+  // across basic block boundaries.
+  RegDU.setUnallocatableRegs(*MBB.getParent());
+
+  // Only allow moving loads from stack or constants if any of the SuccBB's
+  // predecessors have multiple successors.
+  if (HasMultipleSuccs) {
+    IM.reset(new LoadFromStackOrConst());
+  } else {
+    const MachineFrameInfo *MFI = MBB.getParent()->getFrameInfo();
+    IM.reset(new MemDefsUses(MFI));
+  }
+
+  if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Filler))
+    return false;
+
+  insertDelayFiller(Filler, BrMap);
+  addLiveInRegs(Filler, *SuccBB);
+  Filler->eraseFromParent();
+
+  return true;
+}
+
+MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
+  if (B.succ_empty())
+    return NULL;
+
+  // Select the successor with the larget edge weight.
+  CmpWeight Cmp(B, getAnalysis<MachineBranchProbabilityInfo>());
+  MachineBasicBlock *S = *std::max_element(B.succ_begin(), B.succ_end(), Cmp);
+  return S->isLandingPad() ? NULL : S;
+}
+
+std::pair<MipsInstrInfo::BranchType, MachineInstr *>
+Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
+  const MipsInstrInfo *TII =
+    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+  MachineBasicBlock *TrueBB = 0, *FalseBB = 0;
+  SmallVector<MachineInstr*, 2> BranchInstrs;
+  SmallVector<MachineOperand, 2> Cond;
+
+  MipsInstrInfo::BranchType R =
+    TII->AnalyzeBranch(MBB, TrueBB, FalseBB, Cond, false, BranchInstrs);
+
+  if ((R == MipsInstrInfo::BT_None) || (R == MipsInstrInfo::BT_NoBranch))
+    return std::make_pair(R, (MachineInstr*)NULL);
+
+  if (R != MipsInstrInfo::BT_CondUncond) {
+    if (!hasUnoccupiedSlot(BranchInstrs[0]))
+      return std::make_pair(MipsInstrInfo::BT_None, (MachineInstr*)NULL);
+
+    assert(((R != MipsInstrInfo::BT_Uncond) || (TrueBB == &Dst)));
+
+    return std::make_pair(R, BranchInstrs[0]);
+  }
+
+  assert((TrueBB == &Dst) || (FalseBB == &Dst));
+
+  // Examine the conditional branch. See if its slot is occupied.
+  if (hasUnoccupiedSlot(BranchInstrs[0]))
+    return std::make_pair(MipsInstrInfo::BT_Cond, BranchInstrs[0]);
+
+  // If that fails, try the unconditional branch.
+  if (hasUnoccupiedSlot(BranchInstrs[1]) && (FalseBB == &Dst))
+    return std::make_pair(MipsInstrInfo::BT_Uncond, BranchInstrs[1]);
+
+  return std::make_pair(MipsInstrInfo::BT_None, (MachineInstr*)NULL);
+}
+
+bool Filler::examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
+                         RegDefsUses &RegDU, bool &HasMultipleSuccs,
+                         BB2BrMap &BrMap) const {
+  std::pair<MipsInstrInfo::BranchType, MachineInstr *> P =
+    getBranch(Pred, Succ);
+
+  // Return if either getBranch wasn't able to analyze the branches or there
+  // were no branches with unoccupied slots.
+  if (P.first == MipsInstrInfo::BT_None)
+    return false;
+
+  if ((P.first != MipsInstrInfo::BT_Uncond) &&
+      (P.first != MipsInstrInfo::BT_NoBranch)) {
+    HasMultipleSuccs = true;
+    RegDU.addLiveOut(Pred, Succ);
+  }
+
+  BrMap[&Pred] = P.second;
+  return true;
+}
+
+bool Filler::delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
+                            InspectMemInstr &IM) const {
+  bool HasHazard = (Candidate.isImplicitDef() || Candidate.isKill());
 
+  HasHazard |= IM.hasHazard(Candidate);
   HasHazard |= RegDU.update(Candidate, 0, Candidate.getNumOperands());
 
   return HasHazard;
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index df52d92..14268d2 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -39,7 +39,7 @@ protected:
   uint64_t estimateStackSize(const MachineFunction &MF) const;
 };
 
-/// Create MipsInstrInfo objects.
+/// Create MipsFrameLowering objects.
 const MipsFrameLowering *createMips16FrameLowering(const MipsSubtarget &ST);
 const MipsFrameLowering *createMipsSEFrameLowering(const MipsSubtarget &ST);
 
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 78c74ef..77b08cb 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -12,19 +12,19 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "mips-isel"
+#include "MipsISelDAGToDAG.h"
+#include "Mips16ISelDAGToDAG.h"
+#include "MipsSEISelDAGToDAG.h"
 #include "Mips.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MipsAnalyzeImmediate.h"
 #include "MipsMachineFunction.h"
 #include "MipsRegisterInfo.h"
-#include "MipsSubtarget.h"
-#include "MipsTargetMachine.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
@@ -45,270 +45,11 @@ using namespace llvm;
 // MipsDAGToDAGISel - MIPS specific code to select MIPS machine
 // instructions for SelectionDAG operations.
 //===----------------------------------------------------------------------===//
-namespace {
-
-class MipsDAGToDAGISel : public SelectionDAGISel {
-
-  /// TM - Keep a reference to MipsTargetMachine.
-  MipsTargetMachine &TM;
-
-  /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
-  /// make the right decision when generating code for different targets.
-  const MipsSubtarget &Subtarget;
-
-public:
-  explicit MipsDAGToDAGISel(MipsTargetMachine &tm) :
-  SelectionDAGISel(tm),
-  TM(tm), Subtarget(tm.getSubtarget<MipsSubtarget>()) {}
-
-  // Pass Name
-  virtual const char *getPassName() const {
-    return "MIPS DAG->DAG Pattern Instruction Selection";
-  }
-
-  virtual bool runOnMachineFunction(MachineFunction &MF);
-
-private:
-  // Include the pieces autogenerated from the target description.
-  #include "MipsGenDAGISel.inc"
-
-  /// getTargetMachine - Return a reference to the TargetMachine, casted
-  /// to the target-specific type.
-  const MipsTargetMachine &getTargetMachine() {
-    return static_cast<const MipsTargetMachine &>(TM);
-  }
-
-  /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
-  /// to the target-specific type.
-  const MipsInstrInfo *getInstrInfo() {
-    return getTargetMachine().getInstrInfo();
-  }
-
-  SDNode *getGlobalBaseReg();
-
-  SDValue getMips16SPAliasReg();
-
-  void getMips16SPRefReg(SDNode *parent, SDValue &AliasReg);
-
-  std::pair<SDNode*, SDNode*> SelectMULT(SDNode *N, unsigned Opc, DebugLoc dl,
-                                         EVT Ty, bool HasLo, bool HasHi);
-
-  SDNode *Select(SDNode *N);
-
-  // Complex Pattern.
-  /// (reg + imm).
-  bool selectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset) const;
-
-  /// Fall back on this function if all else fails.
-  bool selectAddrDefault(SDValue Addr, SDValue &Base, SDValue &Offset) const;
-
-  /// Match integer address pattern.
-  bool selectIntAddr(SDValue Addr, SDValue &Base, SDValue &Offset) const;
-
-  bool SelectAddr16(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Offset,
-       SDValue &Alias);
-
-  // getImm - Return a target constant with the specified value.
-  inline SDValue getImm(const SDNode *Node, unsigned Imm) {
-    return CurDAG->getTargetConstant(Imm, Node->getValueType(0));
-  }
-
-  void ProcessFunctionAfterISel(MachineFunction &MF);
-  bool ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
-  void InitGlobalBaseReg(MachineFunction &MF);
-  void InitMips16SPAliasReg(MachineFunction &MF);
-
-  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
-                                            char ConstraintCode,
-                                            std::vector<SDValue> &OutOps);
-};
-
-}
-
-// Insert instructions to initialize the global base register in the
-// first MBB of the function. When the ABI is O32 and the relocation model is
-// PIC, the necessary instructions are emitted later to prevent optimization
-// passes from moving them.
-void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) {
-  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-
-  if (!MipsFI->globalBaseRegSet())
-    return;
-
-  MachineBasicBlock &MBB = MF.front();
-  MachineBasicBlock::iterator I = MBB.begin();
-  MachineRegisterInfo &RegInfo = MF.getRegInfo();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
-  unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
-  const TargetRegisterClass *RC;
-
-  if (Subtarget.isABI_N64())
-    RC = (const TargetRegisterClass*)&Mips::CPU64RegsRegClass;
-  else if (Subtarget.inMips16Mode())
-    RC = (const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
-  else
-    RC = (const TargetRegisterClass*)&Mips::CPURegsRegClass;
-
-  V0 = RegInfo.createVirtualRegister(RC);
-  V1 = RegInfo.createVirtualRegister(RC);
-  V2 = RegInfo.createVirtualRegister(RC);
-
-  if (Subtarget.isABI_N64()) {
-    MF.getRegInfo().addLiveIn(Mips::T9_64);
-    MBB.addLiveIn(Mips::T9_64);
-
-    // lui $v0, %hi(%neg(%gp_rel(fname)))
-    // daddu $v1, $v0, $t9
-    // daddiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
-    const GlobalValue *FName = MF.getFunction();
-    BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0)
-      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
-    BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0)
-      .addReg(Mips::T9_64);
-    BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1)
-      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
-    return;
-  }
-
-  if (Subtarget.inMips16Mode()) {
-    BuildMI(MBB, I, DL, TII.get(Mips::LiRxImmX16), V0)
-      .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
-    BuildMI(MBB, I, DL, TII.get(Mips::AddiuRxPcImmX16), V1)
-      .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
-    BuildMI(MBB, I, DL, TII.get(Mips::SllX16), V2).addReg(V0).addImm(16);
-    BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg)
-      .addReg(V1).addReg(V2);
-    return;
-  }
-
-  if (MF.getTarget().getRelocationModel() == Reloc::Static) {
-    // Set global register to __gnu_local_gp.
-    //
-    // lui   $v0, %hi(__gnu_local_gp)
-    // addiu $globalbasereg, $v0, %lo(__gnu_local_gp)
-    BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
-      .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI);
-    BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0)
-      .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO);
-    return;
-  }
-
-  MF.getRegInfo().addLiveIn(Mips::T9);
-  MBB.addLiveIn(Mips::T9);
-
-  if (Subtarget.isABI_N32()) {
-    // lui $v0, %hi(%neg(%gp_rel(fname)))
-    // addu $v1, $v0, $t9
-    // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
-    const GlobalValue *FName = MF.getFunction();
-    BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
-      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
-    BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9);
-    BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1)
-      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
-    return;
-  }
-
-  assert(Subtarget.isABI_O32());
-
-  // For O32 ABI, the following instruction sequence is emitted to initialize
-  // the global base register:
-  //
-  //  0. lui   $2, %hi(_gp_disp)
-  //  1. addiu $2, $2, %lo(_gp_disp)
-  //  2. addu  $globalbasereg, $2, $t9
-  //
-  // We emit only the last instruction here.
-  //
-  // GNU linker requires that the first two instructions appear at the beginning
-  // of a function and no instructions be inserted before or between them.
-  // The two instructions are emitted during lowering to MC layer in order to
-  // avoid any reordering.
-  //
-  // Register $2 (Mips::V0) is added to the list of live-in registers to ensure
-  // the value instruction 1 (addiu) defines is valid when instruction 2 (addu)
-  // reads it.
-  MF.getRegInfo().addLiveIn(Mips::V0);
-  MBB.addLiveIn(Mips::V0);
-  BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg)
-    .addReg(Mips::V0).addReg(Mips::T9);
-}
-
-// Insert instructions to initialize the Mips16 SP Alias register in the
-// first MBB of the function.
-//
-void MipsDAGToDAGISel::InitMips16SPAliasReg(MachineFunction &MF) {
-  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-
-  if (!MipsFI->mips16SPAliasRegSet())
-    return;
-
-  MachineBasicBlock &MBB = MF.front();
-  MachineBasicBlock::iterator I = MBB.begin();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
-  unsigned Mips16SPAliasReg = MipsFI->getMips16SPAliasReg();
-
-  BuildMI(MBB, I, DL, TII.get(Mips::MoveR3216), Mips16SPAliasReg)
-    .addReg(Mips::SP);
-}
-
-
-bool MipsDAGToDAGISel::ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI,
-                                              const MachineInstr& MI) {
-  unsigned DstReg = 0, ZeroReg = 0;
-
-  // Check if MI is "addiu $dst, $zero, 0" or "daddiu $dst, $zero, 0".
-  if ((MI.getOpcode() == Mips::ADDiu) &&
-      (MI.getOperand(1).getReg() == Mips::ZERO) &&
-      (MI.getOperand(2).getImm() == 0)) {
-    DstReg = MI.getOperand(0).getReg();
-    ZeroReg = Mips::ZERO;
-  } else if ((MI.getOpcode() == Mips::DADDiu) &&
-             (MI.getOperand(1).getReg() == Mips::ZERO_64) &&
-             (MI.getOperand(2).getImm() == 0)) {
-    DstReg = MI.getOperand(0).getReg();
-    ZeroReg = Mips::ZERO_64;
-  }
-
-  if (!DstReg)
-    return false;
-
-  // Replace uses with ZeroReg.
-  for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
-       E = MRI->use_end(); U != E;) {
-    MachineOperand &MO = U.getOperand();
-    unsigned OpNo = U.getOperandNo();
-    MachineInstr *MI = MO.getParent();
-    ++U;
-
-    // Do not replace if it is a phi's operand or is tied to def operand.
-    if (MI->isPHI() || MI->isRegTiedToDefOperand(OpNo) || MI->isPseudo())
-      continue;
-
-    MO.setReg(ZeroReg);
-  }
-
-  return true;
-}
-
-void MipsDAGToDAGISel::ProcessFunctionAfterISel(MachineFunction &MF) {
-  InitGlobalBaseReg(MF);
-  InitMips16SPAliasReg(MF);
-
-  MachineRegisterInfo *MRI = &MF.getRegInfo();
-
-  for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE;
-       ++MFI)
-    for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I)
-      ReplaceUsesWithZeroReg(MRI, *I);
-}
 
 bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   bool Ret = SelectionDAGISel::runOnMachineFunction(MF);
 
-  ProcessFunctionAfterISel(MF);
+  processFunctionAfterISel(MF);
 
   return Ret;
 }
@@ -320,233 +61,36 @@ SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
   return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
 }
 
-/// getMips16SPAliasReg - Output the instructions required to put the
-/// SP into a Mips16 accessible aliased register.
-SDValue MipsDAGToDAGISel::getMips16SPAliasReg() {
-  unsigned Mips16SPAliasReg =
-    MF->getInfo<MipsFunctionInfo>()->getMips16SPAliasReg();
-  return CurDAG->getRegister(Mips16SPAliasReg, TLI.getPointerTy());
-}
-
 /// ComplexPattern used on MipsInstrInfo
 /// Used on Mips Load/Store instructions
 bool MipsDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
                                         SDValue &Offset) const {
-  EVT ValTy = Addr.getValueType();
-
-  // if Address is FI, get the TargetFrameIndex.
-  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
-    Offset = CurDAG->getTargetConstant(0, ValTy);
-    return true;
-  }
-
-  // on PIC code Load GA
-  if (Addr.getOpcode() == MipsISD::Wrapper) {
-    Base   = Addr.getOperand(0);
-    Offset = Addr.getOperand(1);
-    return true;
-  }
-
-  if (TM.getRelocationModel() != Reloc::PIC_) {
-    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
-        Addr.getOpcode() == ISD::TargetGlobalAddress))
-      return false;
-  }
-
-  // Addresses of the form FI+const or FI|const
-  if (CurDAG->isBaseWithConstantOffset(Addr)) {
-    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
-    if (isInt<16>(CN->getSExtValue())) {
-
-      // If the first operand is a FI, get the TargetFI Node
-      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
-                                  (Addr.getOperand(0)))
-        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
-      else
-        Base = Addr.getOperand(0);
-
-      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
-      return true;
-    }
-  }
-
-  // Operand is a result from an ADD.
-  if (Addr.getOpcode() == ISD::ADD) {
-    // When loading from constant pools, load the lower address part in
-    // the instruction itself. Example, instead of:
-    //  lui $2, %hi($CPI1_0)
-    //  addiu $2, $2, %lo($CPI1_0)
-    //  lwc1 $f0, 0($2)
-    // Generate:
-    //  lui $2, %hi($CPI1_0)
-    //  lwc1 $f0, %lo($CPI1_0)($2)
-    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
-        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
-      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
-      if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
-          isa<JumpTableSDNode>(Opnd0)) {
-        Base = Addr.getOperand(0);
-        Offset = Opnd0;
-        return true;
-      }
-    }
-  }
-
+  llvm_unreachable("Unimplemented function.");
   return false;
 }
 
 bool MipsDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) const {
-  Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, Addr.getValueType());
-  return true;
+  llvm_unreachable("Unimplemented function.");
+  return false;
 }
 
 bool MipsDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
                                      SDValue &Offset) const {
-  return selectAddrRegImm(Addr, Base, Offset) ||
-    selectAddrDefault(Addr, Base, Offset);
-}
-
-void MipsDAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
-  SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, TLI.getPointerTy());
-  if (Parent) {
-    switch (Parent->getOpcode()) {
-      case ISD::LOAD: {
-        LoadSDNode *SD = dyn_cast<LoadSDNode>(Parent);
-        switch (SD->getMemoryVT().getSizeInBits()) {
-        case 8:
-        case 16:
-          AliasReg = TM.getFrameLowering()->hasFP(*MF)?
-            AliasFPReg: getMips16SPAliasReg();
-          return;
-        }
-        break;
-      }
-      case ISD::STORE: {
-        StoreSDNode *SD = dyn_cast<StoreSDNode>(Parent);
-        switch (SD->getMemoryVT().getSizeInBits()) {
-        case 8:
-        case 16:
-          AliasReg = TM.getFrameLowering()->hasFP(*MF)?
-            AliasFPReg: getMips16SPAliasReg();
-          return;
-        }
-        break;
-      }
-    }
-  }
-  AliasReg = CurDAG->getRegister(Mips::SP, TLI.getPointerTy());
-  return;
-
-}
-bool MipsDAGToDAGISel::SelectAddr16(
-  SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset,
-  SDValue &Alias) {
-  EVT ValTy = Addr.getValueType();
-
-  Alias = CurDAG->getTargetConstant(0, ValTy);
-
-  // if Address is FI, get the TargetFrameIndex.
-  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
-    Offset = CurDAG->getTargetConstant(0, ValTy);
-    getMips16SPRefReg(Parent, Alias);
-    return true;
-  }
-  // on PIC code Load GA
-  if (Addr.getOpcode() == MipsISD::Wrapper) {
-    Base   = Addr.getOperand(0);
-    Offset = Addr.getOperand(1);
-    return true;
-  }
-  if (TM.getRelocationModel() != Reloc::PIC_) {
-    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
-        Addr.getOpcode() == ISD::TargetGlobalAddress))
-      return false;
-  }
-  // Addresses of the form FI+const or FI|const
-  if (CurDAG->isBaseWithConstantOffset(Addr)) {
-    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
-    if (isInt<16>(CN->getSExtValue())) {
-
-      // If the first operand is a FI, get the TargetFI Node
-      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
-                                  (Addr.getOperand(0))) {
-        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
-        getMips16SPRefReg(Parent, Alias);
-      }
-      else
-        Base = Addr.getOperand(0);
-
-      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
-      return true;
-    }
-  }
-  // Operand is a result from an ADD.
-  if (Addr.getOpcode() == ISD::ADD) {
-    // When loading from constant pools, load the lower address part in
-    // the instruction itself. Example, instead of:
-    //  lui $2, %hi($CPI1_0)
-    //  addiu $2, $2, %lo($CPI1_0)
-    //  lwc1 $f0, 0($2)
-    // Generate:
-    //  lui $2, %hi($CPI1_0)
-    //  lwc1 $f0, %lo($CPI1_0)($2)
-    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
-        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
-      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
-      if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
-          isa<JumpTableSDNode>(Opnd0)) {
-        Base = Addr.getOperand(0);
-        Offset = Opnd0;
-        return true;
-      }
-    }
-
-    // If an indexed floating point load/store can be emitted, return false.
-    const LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Parent);
-
-    if (LS &&
-        (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) &&
-        Subtarget.hasFPIdx())
-      return false;
-  }
-  Base   = Addr;
-  Offset = CurDAG->getTargetConstant(0, ValTy);
-  return true;
+  llvm_unreachable("Unimplemented function.");
+  return false;
 }
 
-/// Select multiply instructions.
-std::pair<SDNode*, SDNode*>
-MipsDAGToDAGISel::SelectMULT(SDNode *N, unsigned Opc, DebugLoc dl, EVT Ty,
-                             bool HasLo, bool HasHi) {
-  SDNode *Lo = 0, *Hi = 0;
-  SDNode *Mul = CurDAG->getMachineNode(Opc, dl, MVT::Glue, N->getOperand(0),
-                                       N->getOperand(1));
-  SDValue InFlag = SDValue(Mul, 0);
-
-  if (HasLo) {
-    unsigned Opcode = Subtarget.inMips16Mode() ? Mips::Mflo16 :
-      (Ty == MVT::i32 ? Mips::MFLO : Mips::MFLO64);
-    Lo = CurDAG->getMachineNode(Opcode, dl, Ty, MVT::Glue, InFlag);
-    InFlag = SDValue(Lo, 1);
-  }
-  if (HasHi) {
-    unsigned Opcode = Subtarget.inMips16Mode() ? Mips::Mfhi16 :
-      (Ty == MVT::i32 ? Mips::MFHI : Mips::MFHI64);
-    Hi = CurDAG->getMachineNode(Opcode, dl, Ty, InFlag);
-  }
-  return std::make_pair(Lo, Hi);
+bool MipsDAGToDAGISel::selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
+                                    SDValue &Offset, SDValue &Alias) {
+  llvm_unreachable("Unimplemented function.");
+  return false;
 }
 
-
 /// Select instructions not customized! Used for
 /// expanded, promoted and normal instructions
 SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
   unsigned Opcode = Node->getOpcode();
-  DebugLoc dl = Node->getDebugLoc();
 
   // Dump information about the Node being selected
   DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
@@ -557,167 +101,19 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
     return NULL;
   }
 
-  ///
-  // Instruction Selection not handled by the auto-generated
-  // tablegen selection should be handled here.
-  ///
-  EVT NodeTy = Node->getValueType(0);
-  unsigned MultOpc;
+  // See if subclasses can handle this node.
+  std::pair<bool, SDNode*> Ret = selectNode(Node);
+
+  if (Ret.first)
+    return Ret.second;
 
   switch(Opcode) {
   default: break;
 
-  case ISD::SUBE:
-  case ISD::ADDE: {
-    bool inMips16Mode = Subtarget.inMips16Mode();
-    SDValue InFlag = Node->getOperand(2), CmpLHS;
-    unsigned Opc = InFlag.getOpcode(); (void)Opc;
-    assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
-            (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
-           "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
-
-    unsigned MOp;
-    if (Opcode == ISD::ADDE) {
-      CmpLHS = InFlag.getValue(0);
-      if (inMips16Mode)
-        MOp = Mips::AdduRxRyRz16;
-      else
-        MOp = Mips::ADDu;
-    } else {
-      CmpLHS = InFlag.getOperand(0);
-      if (inMips16Mode)
-        MOp = Mips::SubuRxRyRz16;
-      else
-        MOp = Mips::SUBu;
-    }
-
-    SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
-
-    SDValue LHS = Node->getOperand(0);
-    SDValue RHS = Node->getOperand(1);
-
-    EVT VT = LHS.getValueType();
-
-    unsigned Sltu_op = inMips16Mode? Mips::SltuRxRyRz16: Mips::SLTu;
-    SDNode *Carry = CurDAG->getMachineNode(Sltu_op, dl, VT, Ops, 2);
-    unsigned Addu_op = inMips16Mode? Mips::AdduRxRyRz16 : Mips::ADDu;
-    SDNode *AddCarry = CurDAG->getMachineNode(Addu_op, dl, VT,
-                                              SDValue(Carry,0), RHS);
-
-    return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue,
-                                LHS, SDValue(AddCarry,0));
-  }
-
-  /// Mul with two results
-  case ISD::SMUL_LOHI:
-  case ISD::UMUL_LOHI: {
-    if (NodeTy == MVT::i32) {
-      if (Subtarget.inMips16Mode())
-        MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MultuRxRy16 :
-                   Mips::MultRxRy16);
-      else
-        MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MULTu : Mips::MULT);
-    }
-    else
-      MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::DMULTu : Mips::DMULT);
-
-    std::pair<SDNode*, SDNode*> LoHi = SelectMULT(Node, MultOpc, dl, NodeTy,
-                                                  true, true);
-
-    if (!SDValue(Node, 0).use_empty())
-      ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));
-
-    if (!SDValue(Node, 1).use_empty())
-      ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
-
-    return NULL;
-  }
-
-  /// Special Muls
-  case ISD::MUL: {
-    // Mips32 has a 32-bit three operand mul instruction.
-    if (Subtarget.hasMips32() && NodeTy == MVT::i32)
-      break;
-    return SelectMULT(Node, NodeTy == MVT::i32 ? Mips::MULT : Mips::DMULT,
-                      dl, NodeTy, true, false).first;
-  }
-  case ISD::MULHS:
-  case ISD::MULHU: {
-    if (NodeTy == MVT::i32) {
-      if (Subtarget.inMips16Mode())
-        MultOpc = (Opcode == ISD::MULHU ?
-                   Mips::MultuRxRy16 : Mips::MultRxRy16);
-      else
-        MultOpc = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT);
-    }
-    else
-      MultOpc = (Opcode == ISD::MULHU ? Mips::DMULTu : Mips::DMULT);
-
-    return SelectMULT(Node, MultOpc, dl, NodeTy, false, true).second;
-  }
-
   // Get target GOT address.
   case ISD::GLOBAL_OFFSET_TABLE:
     return getGlobalBaseReg();
 
-  case ISD::ConstantFP: {
-    ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node);
-    if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
-      if (Subtarget.hasMips64()) {
-        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                              Mips::ZERO_64, MVT::i64);
-        return CurDAG->getMachineNode(Mips::DMTC1, dl, MVT::f64, Zero);
-      }
-
-      SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                            Mips::ZERO, MVT::i32);
-      return CurDAG->getMachineNode(Mips::BuildPairF64, dl, MVT::f64, Zero,
-                                    Zero);
-    }
-    break;
-  }
-
-  case ISD::Constant: {
-    const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Node);
-    unsigned Size = CN->getValueSizeInBits(0);
-
-    if (Size == 32)
-      break;
-
-    MipsAnalyzeImmediate AnalyzeImm;
-    int64_t Imm = CN->getSExtValue();
-
-    const MipsAnalyzeImmediate::InstSeq &Seq =
-      AnalyzeImm.Analyze(Imm, Size, false);
-
-    MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
-    DebugLoc DL = CN->getDebugLoc();
-    SDNode *RegOpnd;
-    SDValue ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
-                                                MVT::i64);
-
-    // The first instruction can be a LUi which is different from other
-    // instructions (ADDiu, ORI and SLL) in that it does not have a register
-    // operand.
-    if (Inst->Opc == Mips::LUi64)
-      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64, ImmOpnd);
-    else
-      RegOpnd =
-        CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
-                               CurDAG->getRegister(Mips::ZERO_64, MVT::i64),
-                               ImmOpnd);
-
-    // The remaining instructions in the sequence are handled here.
-    for (++Inst; Inst != Seq.end(); ++Inst) {
-      ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
-                                          MVT::i64);
-      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
-                                       SDValue(RegOpnd, 0), ImmOpnd);
-    }
-
-    return RegOpnd;
-  }
-
 #ifndef NDEBUG
   case ISD::LOAD:
   case ISD::STORE:
@@ -726,31 +122,6 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
            "Unexpected unaligned loads/stores.");
     break;
 #endif
-
-  case MipsISD::ThreadPointer: {
-    EVT PtrVT = TLI.getPointerTy();
-    unsigned RdhwrOpc, SrcReg, DestReg;
-
-    if (PtrVT == MVT::i32) {
-      RdhwrOpc = Mips::RDHWR;
-      SrcReg = Mips::HWR29;
-      DestReg = Mips::V1;
-    } else {
-      RdhwrOpc = Mips::RDHWR64;
-      SrcReg = Mips::HWR29_64;
-      DestReg = Mips::V1_64;
-    }
-
-    SDNode *Rdhwr =
-      CurDAG->getMachineNode(RdhwrOpc, Node->getDebugLoc(),
-                             Node->getValueType(0),
-                             CurDAG->getRegister(SrcReg, PtrVT));
-    SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, DestReg,
-                                         SDValue(Rdhwr, 0));
-    SDValue ResNode = CurDAG->getCopyFromReg(Chain, dl, DestReg, PtrVT);
-    ReplaceUses(SDValue(Node, 0), ResNode);
-    return ResNode.getNode();
-  }
   }
 
   // Select the default instruction
@@ -776,5 +147,8 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
 /// createMipsISelDag - This pass converts a legalized DAG into a
 /// MIPS-specific DAG, ready for instruction scheduling.
 FunctionPass *llvm::createMipsISelDag(MipsTargetMachine &TM) {
-  return new MipsDAGToDAGISel(TM);
+  if (TM.getSubtargetImpl()->inMips16Mode())
+    return llvm::createMips16ISelDag(TM);
+
+  return llvm::createMipsSEISelDag(TM);
 }
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
new file mode 100644
index 0000000..cf0f9c5
--- /dev/null
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -0,0 +1,93 @@
+//===---- MipsISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSISELDAGTODAG_H
+#define MIPSISELDAGTODAG_H
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace llvm {
+
+class MipsDAGToDAGISel : public SelectionDAGISel {
+public:
+  explicit MipsDAGToDAGISel(MipsTargetMachine &TM)
+    : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<MipsSubtarget>()) {}
+
+  // Pass Name
+  virtual const char *getPassName() const {
+    return "MIPS DAG->DAG Pattern Instruction Selection";
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+protected:
+  SDNode *getGlobalBaseReg();
+
+  /// Keep a pointer to the MipsSubtarget around so that we can make the right
+  /// decision when generating code for different targets.
+  const MipsSubtarget &Subtarget;
+
+private:
+  // Include the pieces autogenerated from the target description.
+  #include "MipsGenDAGISel.inc"
+
+  // Complex Pattern.
+  /// (reg + imm).
+  virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
+  /// Fall back on this function if all else fails.
+  virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
+                                 SDValue &Offset) const;
+
+  /// Match integer address pattern.
+  virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
+                             SDValue &Offset) const;
+
+  virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
+                            SDValue &Offset, SDValue &Alias);
+
+  virtual SDNode *Select(SDNode *N);
+
+  virtual std::pair<bool, SDNode*> selectNode(SDNode *Node) = 0;
+
+  // getImm - Return a target constant with the specified value.
+  inline SDValue getImm(const SDNode *Node, uint64_t Imm) {
+    return CurDAG->getTargetConstant(Imm, Node->getValueType(0));
+  }
+
+  virtual void processFunctionAfterISel(MachineFunction &MF) = 0;
+
+  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                            char ConstraintCode,
+                                            std::vector<SDValue> &OutOps);
+};
+
+/// createMipsISelDag - This pass converts a legalized DAG into a
+/// MIPS-specific DAG, ready for instruction scheduling.
+FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
+
+}
+
+#endif
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 36e1a15..4bf43f4 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 #define DEBUG_TYPE "mips-lower"
-#include <set>
 #include "MipsISelLowering.h"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
@@ -30,7 +29,6 @@
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/CommandLine.h"
@@ -43,26 +41,9 @@ using namespace llvm;
 STATISTIC(NumTailCalls, "Number of tail calls");
 
 static cl::opt<bool>
-EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden,
-                    cl::desc("MIPS: Enable tail calls."), cl::init(false));
-
-static cl::opt<bool>
 LargeGOT("mxgot", cl::Hidden,
          cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false));
 
-static cl::opt<bool>
-Mips16HardFloat("mips16-hard-float", cl::NotHidden,
-                cl::desc("MIPS: mips16 hard float enable."),
-                cl::init(false));
-
-static cl::opt<bool> DontExpandCondPseudos16(
-  "mips16-dont-expand-cond-pseudo",
-  cl::init(false),
-  cl::desc("Dont expand conditional move related "
-           "pseudos for Mips 16"),
-  cl::Hidden);
-
-
 static const uint16_t O32IntRegs[4] = {
   Mips::A0, Mips::A1, Mips::A2, Mips::A3
 };
@@ -80,7 +61,7 @@ static const uint16_t Mips64DPRegs[8] = {
 // If I is a shifted mask, set the size (Size) and the first bit of the
 // mask (Pos), and return true.
 // For example, if I is 0x003ff800, (Pos, Size) = (11, 11).
-static bool IsShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
+static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
   if (!isShiftedMask_64(I))
      return false;
 
@@ -89,7 +70,7 @@ static bool IsShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
   return true;
 }
 
-static SDValue GetGlobalReg(SelectionDAG &DAG, EVT Ty) {
+SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const {
   MipsFunctionInfo *FI = DAG.getMachineFunction().getInfo<MipsFunctionInfo>();
   return DAG.getRegister(FI->getGlobalBaseReg(), Ty);
 }
@@ -124,11 +105,12 @@ static SDValue getAddrNonPIC(SDValue Op, SelectionDAG &DAG) {
                      DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
 }
 
-static SDValue getAddrLocal(SDValue Op, SelectionDAG &DAG, bool HasMips64) {
+SDValue MipsTargetLowering::getAddrLocal(SDValue Op, SelectionDAG &DAG,
+                                         bool HasMips64) const {
   DebugLoc DL = Op.getDebugLoc();
   EVT Ty = Op.getValueType();
   unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
-  SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, GetGlobalReg(DAG, Ty),
+  SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
                             getTargetNode(Op, DAG, GOTFlag));
   SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
                              MachinePointerInfo::getGOT(), false, false, false,
@@ -138,21 +120,23 @@ static SDValue getAddrLocal(SDValue Op, SelectionDAG &DAG, bool HasMips64) {
   return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
 }
 
-static SDValue getAddrGlobal(SDValue Op, SelectionDAG &DAG, unsigned Flag) {
+SDValue MipsTargetLowering::getAddrGlobal(SDValue Op, SelectionDAG &DAG,
+                                          unsigned Flag) const {
   DebugLoc DL = Op.getDebugLoc();
   EVT Ty = Op.getValueType();
-  SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, GetGlobalReg(DAG, Ty),
+  SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
                             getTargetNode(Op, DAG, Flag));
   return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Tgt,
                      MachinePointerInfo::getGOT(), false, false, false, 0);
 }
 
-static SDValue getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
-                                     unsigned HiFlag, unsigned LoFlag) {
+SDValue MipsTargetLowering::getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
+                                                  unsigned HiFlag,
+                                                  unsigned LoFlag) const {
   DebugLoc DL = Op.getDebugLoc();
   EVT Ty = Op.getValueType();
   SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(Op, DAG, HiFlag));
-  Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, GetGlobalReg(DAG, Ty));
+  Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
   SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
                                 getTargetNode(Op, DAG, LoFlag));
   return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Wrapper,
@@ -204,7 +188,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MipsISD::MTHLIP:            return "MipsISD::MTHLIP";
   case MipsISD::MULT:              return "MipsISD::MULT";
   case MipsISD::MULTU:             return "MipsISD::MULTU";
-  case MipsISD::MADD_DSP:          return "MipsISD::MADD_DSPDSP";
+  case MipsISD::MADD_DSP:          return "MipsISD::MADD_DSP";
   case MipsISD::MADDU_DSP:         return "MipsISD::MADDU_DSP";
   case MipsISD::MSUB_DSP:          return "MipsISD::MSUB_DSP";
   case MipsISD::MSUBU_DSP:         return "MipsISD::MSUBU_DSP";
@@ -212,110 +196,17 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   }
 }
 
-namespace {
-  struct ltstr {
-    bool operator()(const char *s1, const char *s2) const
-    {
-      return strcmp(s1, s2) < 0;
-    }
-  };
-
-  std::set<const char*, ltstr> noHelperNeeded;
-}
-
-void MipsTargetLowering::SetMips16LibcallName
-  (RTLIB::Libcall l, const char *Name) {
-  setLibcallName(l, Name);
-  noHelperNeeded.insert(Name);
-}
-
-void MipsTargetLowering::setMips16HardFloatLibCalls() {
-  SetMips16LibcallName(RTLIB::ADD_F32, "__mips16_addsf3");
-  SetMips16LibcallName(RTLIB::ADD_F64, "__mips16_adddf3");
-  SetMips16LibcallName(RTLIB::SUB_F32, "__mips16_subsf3");
-  SetMips16LibcallName(RTLIB::SUB_F64, "__mips16_subdf3");
-  SetMips16LibcallName(RTLIB::MUL_F32, "__mips16_mulsf3");
-  SetMips16LibcallName(RTLIB::MUL_F64, "__mips16_muldf3");
-  SetMips16LibcallName(RTLIB::DIV_F32, "__mips16_divsf3");
-  SetMips16LibcallName(RTLIB::DIV_F64, "__mips16_divdf3");
-  SetMips16LibcallName(RTLIB::FPEXT_F32_F64, "__mips16_extendsfdf2");
-  SetMips16LibcallName(RTLIB::FPROUND_F64_F32, "__mips16_truncdfsf2");
-  SetMips16LibcallName(RTLIB::FPTOSINT_F32_I32, "__mips16_fix_truncsfsi");
-  SetMips16LibcallName(RTLIB::FPTOSINT_F64_I32, "__mips16_fix_truncdfsi");
-  SetMips16LibcallName(RTLIB::SINTTOFP_I32_F32, "__mips16_floatsisf");
-  SetMips16LibcallName(RTLIB::SINTTOFP_I32_F64, "__mips16_floatsidf");
-  SetMips16LibcallName(RTLIB::UINTTOFP_I32_F32, "__mips16_floatunsisf");
-  SetMips16LibcallName(RTLIB::UINTTOFP_I32_F64, "__mips16_floatunsidf");
-  SetMips16LibcallName(RTLIB::OEQ_F32, "__mips16_eqsf2");
-  SetMips16LibcallName(RTLIB::OEQ_F64, "__mips16_eqdf2");
-  SetMips16LibcallName(RTLIB::UNE_F32, "__mips16_nesf2");
-  SetMips16LibcallName(RTLIB::UNE_F64, "__mips16_nedf2");
-  SetMips16LibcallName(RTLIB::OGE_F32, "__mips16_gesf2");
-  SetMips16LibcallName(RTLIB::OGE_F64, "__mips16_gedf2");
-  SetMips16LibcallName(RTLIB::OLT_F32, "__mips16_ltsf2");
-  SetMips16LibcallName(RTLIB::OLT_F64, "__mips16_ltdf2");
-  SetMips16LibcallName(RTLIB::OLE_F32, "__mips16_lesf2");
-  SetMips16LibcallName(RTLIB::OLE_F64, "__mips16_ledf2");
-  SetMips16LibcallName(RTLIB::OGT_F32, "__mips16_gtsf2");
-  SetMips16LibcallName(RTLIB::OGT_F64, "__mips16_gtdf2");
-  SetMips16LibcallName(RTLIB::UO_F32, "__mips16_unordsf2");
-  SetMips16LibcallName(RTLIB::UO_F64, "__mips16_unorddf2");
-  SetMips16LibcallName(RTLIB::O_F32, "__mips16_unordsf2");
-  SetMips16LibcallName(RTLIB::O_F64, "__mips16_unorddf2");
-}
-
 MipsTargetLowering::
 MipsTargetLowering(MipsTargetMachine &TM)
   : TargetLowering(TM, new MipsTargetObjectFile()),
     Subtarget(&TM.getSubtarget<MipsSubtarget>()),
     HasMips64(Subtarget->hasMips64()), IsN64(Subtarget->isABI_N64()),
     IsO32(Subtarget->isABI_O32()) {
-
   // Mips does not have i1 type, so use i32 for
   // setcc operations results (slt, sgt, ...).
   setBooleanContents(ZeroOrOneBooleanContent);
   setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
 
-  // Set up the register classes
-  addRegisterClass(MVT::i32, &Mips::CPURegsRegClass);
-
-  if (HasMips64)
-    addRegisterClass(MVT::i64, &Mips::CPU64RegsRegClass);
-
-  if (Subtarget->inMips16Mode()) {
-    addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
-    if (Mips16HardFloat)
-      setMips16HardFloatLibCalls();
-  }
-
-  if (Subtarget->hasDSP()) {
-    MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
-
-    for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
-      addRegisterClass(VecTys[i], &Mips::DSPRegsRegClass);
-
-      // Expand all builtin opcodes.
-      for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
-        setOperationAction(Opc, VecTys[i], Expand);
-
-      setOperationAction(ISD::LOAD, VecTys[i], Legal);
-      setOperationAction(ISD::STORE, VecTys[i], Legal);
-      setOperationAction(ISD::BITCAST, VecTys[i], Legal);
-    }
-  }
-
-  if (!TM.Options.UseSoftFloat) {
-    addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
-
-    // When dealing with single precision only, use libcalls
-    if (!Subtarget->isSingleFloat()) {
-      if (HasMips64)
-        addRegisterClass(MVT::f64, &Mips::FGR64RegClass);
-      else
-        addRegisterClass(MVT::f64, &Mips::AFGR64RegClass);
-    }
-  }
-
   // Load extented operations for i1 types must be promoted
   setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
   setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
@@ -332,6 +223,7 @@ MipsTargetLowering(MipsTargetMachine &TM)
   AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
 
   // Mips Custom Operations
+  setOperationAction(ISD::BR_JT,              MVT::Other, Custom);
   setOperationAction(ISD::GlobalAddress,      MVT::i32,   Custom);
   setOperationAction(ISD::BlockAddress,       MVT::i32,   Custom);
   setOperationAction(ISD::GlobalTLSAddress,   MVT::i32,   Custom);
@@ -348,18 +240,6 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
   setOperationAction(ISD::FCOPYSIGN,          MVT::f32,   Custom);
   setOperationAction(ISD::FCOPYSIGN,          MVT::f64,   Custom);
-  if (Subtarget->inMips16Mode()) {
-    setOperationAction(ISD::MEMBARRIER,         MVT::Other, Expand);
-    setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Expand);
-  }
-  else {
-    setOperationAction(ISD::MEMBARRIER,         MVT::Other, Custom);
-    setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Custom);
-  }
-  if (!Subtarget->inMips16Mode()) {
-    setOperationAction(ISD::LOAD,               MVT::i32, Custom);
-    setOperationAction(ISD::STORE,              MVT::i32, Custom);
-  }
 
   if (!TM.Options.NoNaNsFPMath) {
     setOperationAction(ISD::FABS,             MVT::f32,   Custom);
@@ -397,8 +277,10 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::UREM, MVT::i64, Expand);
 
   // Operations not directly supported by Mips.
-  setOperationAction(ISD::BR_JT,             MVT::Other, Expand);
-  setOperationAction(ISD::BR_CC,             MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,             MVT::f32,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::f64,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i32,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i64,   Expand);
   setOperationAction(ISD::SELECT_CC,         MVT::Other, Expand);
   setOperationAction(ISD::UINT_TO_FP,        MVT::i32,   Expand);
   setOperationAction(ISD::UINT_TO_FP,        MVT::i64,   Expand);
@@ -470,21 +352,6 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::ATOMIC_STORE,      MVT::i32,    Expand);
   setOperationAction(ISD::ATOMIC_STORE,      MVT::i64,    Expand);
 
-  if (Subtarget->inMips16Mode()) {
-    setOperationAction(ISD::ATOMIC_CMP_SWAP,       MVT::i32,    Expand);
-    setOperationAction(ISD::ATOMIC_SWAP,           MVT::i32,    Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_ADD,       MVT::i32,    Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_SUB,       MVT::i32,    Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_AND,       MVT::i32,    Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_OR,        MVT::i32,    Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_XOR,       MVT::i32,    Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_NAND,      MVT::i32,    Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_MIN,       MVT::i32,    Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_MAX,       MVT::i32,    Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_UMIN,      MVT::i32,    Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_UMAX,      MVT::i32,    Expand);
-  }
-
   setInsertFencesForAtomic(true);
 
   if (!Subtarget->hasSEInReg()) {
@@ -521,7 +388,6 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setMinFunctionAlignment(HasMips64 ? 3 : 2);
 
   setStackPointerRegisterToSaveRestore(IsN64 ? Mips::SP_64 : Mips::SP);
-  computeRegisterProperties();
 
   setExceptionPointerRegister(IsN64 ? Mips::A0_64 : Mips::A0);
   setExceptionSelectorRegister(IsN64 ? Mips::A1_64 : Mips::A1);
@@ -529,22 +395,11 @@ MipsTargetLowering(MipsTargetMachine &TM)
   MaxStoresPerMemcpy = 16;
 }
 
-bool
-MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
-  MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
-
-  if (Subtarget->inMips16Mode())
-    return false;
+const MipsTargetLowering *MipsTargetLowering::create(MipsTargetMachine &TM) {
+  if (TM.getSubtargetImpl()->inMips16Mode())
+    return llvm::createMips16TargetLowering(TM);
 
-  switch (SVT) {
-  case MVT::i64:
-  case MVT::i32:
-    if (Fast)
-      *Fast = true;
-    return true;
-  default:
-    return false;
-  }
+  return llvm::createMipsSETargetLowering(TM);
 }
 
 EVT MipsTargetLowering::getSetCCResultType(EVT VT) const {
@@ -553,7 +408,7 @@ EVT MipsTargetLowering::getSetCCResultType(EVT VT) const {
   return VT.changeVectorElementTypeToInteger();
 }
 
-// SelectMadd -
+// selectMADD -
 // Transforms a subgraph in CurDAG if the following pattern is found:
 //  (addc multLo, Lo0), (adde multHi, Hi0),
 // where,
@@ -561,7 +416,7 @@ EVT MipsTargetLowering::getSetCCResultType(EVT VT) const {
 //  Lo0: initial value of Lo register
 //  Hi0: initial value of Hi register
 // Return true if pattern matching was successful.
-static bool SelectMadd(SDNode *ADDENode, SelectionDAG *CurDAG) {
+static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
   // ADDENode's second operand must be a flag output of an ADDC node in order
   // for the matching to be successful.
   SDNode *ADDCNode = ADDENode->getOperand(2).getNode();
@@ -598,21 +453,21 @@ static bool SelectMadd(SDNode *ADDENode, SelectionDAG *CurDAG) {
     return false;
 
   SDValue Chain = CurDAG->getEntryNode();
-  DebugLoc dl = ADDENode->getDebugLoc();
+  DebugLoc DL = ADDENode->getDebugLoc();
 
   // create MipsMAdd(u) node
   MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MAddu : MipsISD::MAdd;
 
-  SDValue MAdd = CurDAG->getNode(MultOpc, dl, MVT::Glue,
+  SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Glue,
                                  MultNode->getOperand(0),// Factor 0
                                  MultNode->getOperand(1),// Factor 1
                                  ADDCNode->getOperand(1),// Lo0
                                  ADDENode->getOperand(1));// Hi0
 
   // create CopyFromReg nodes
-  SDValue CopyFromLo = CurDAG->getCopyFromReg(Chain, dl, Mips::LO, MVT::i32,
+  SDValue CopyFromLo = CurDAG->getCopyFromReg(Chain, DL, Mips::LO, MVT::i32,
                                               MAdd);
-  SDValue CopyFromHi = CurDAG->getCopyFromReg(CopyFromLo.getValue(1), dl,
+  SDValue CopyFromHi = CurDAG->getCopyFromReg(CopyFromLo.getValue(1), DL,
                                               Mips::HI, MVT::i32,
                                               CopyFromLo.getValue(2));
 
@@ -626,7 +481,7 @@ static bool SelectMadd(SDNode *ADDENode, SelectionDAG *CurDAG) {
   return true;
 }
 
-// SelectMsub -
+// selectMSUB -
 // Transforms a subgraph in CurDAG if the following pattern is found:
 //  (addc Lo0, multLo), (sube Hi0, multHi),
 // where,
@@ -634,7 +489,7 @@ static bool SelectMadd(SDNode *ADDENode, SelectionDAG *CurDAG) {
 //  Lo0: initial value of Lo register
 //  Hi0: initial value of Hi register
 // Return true if pattern matching was successful.
-static bool SelectMsub(SDNode *SUBENode, SelectionDAG *CurDAG) {
+static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
   // SUBENode's second operand must be a flag output of an SUBC node in order
   // for the matching to be successful.
   SDNode *SUBCNode = SUBENode->getOperand(2).getNode();
@@ -671,21 +526,21 @@ static bool SelectMsub(SDNode *SUBENode, SelectionDAG *CurDAG) {
     return false;
 
   SDValue Chain = CurDAG->getEntryNode();
-  DebugLoc dl = SUBENode->getDebugLoc();
+  DebugLoc DL = SUBENode->getDebugLoc();
 
   // create MipsSub(u) node
   MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MSubu : MipsISD::MSub;
 
-  SDValue MSub = CurDAG->getNode(MultOpc, dl, MVT::Glue,
+  SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue,
                                  MultNode->getOperand(0),// Factor 0
                                  MultNode->getOperand(1),// Factor 1
                                  SUBCNode->getOperand(0),// Lo0
                                  SUBENode->getOperand(0));// Hi0
 
   // create CopyFromReg nodes
-  SDValue CopyFromLo = CurDAG->getCopyFromReg(Chain, dl, Mips::LO, MVT::i32,
+  SDValue CopyFromLo = CurDAG->getCopyFromReg(Chain, DL, Mips::LO, MVT::i32,
                                               MSub);
-  SDValue CopyFromHi = CurDAG->getCopyFromReg(CopyFromLo.getValue(1), dl,
+  SDValue CopyFromHi = CurDAG->getCopyFromReg(CopyFromLo.getValue(1), DL,
                                               Mips::HI, MVT::i32,
                                               CopyFromLo.getValue(2));
 
@@ -699,33 +554,33 @@ static bool SelectMsub(SDNode *SUBENode, SelectionDAG *CurDAG) {
   return true;
 }
 
-static SDValue PerformADDECombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const MipsSubtarget *Subtarget) {
   if (DCI.isBeforeLegalize())
     return SDValue();
 
   if (Subtarget->hasMips32() && N->getValueType(0) == MVT::i32 &&
-      SelectMadd(N, &DAG))
+      selectMADD(N, &DAG))
     return SDValue(N, 0);
 
   return SDValue();
 }
 
-static SDValue PerformSUBECombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const MipsSubtarget *Subtarget) {
   if (DCI.isBeforeLegalize())
     return SDValue();
 
   if (Subtarget->hasMips32() && N->getValueType(0) == MVT::i32 &&
-      SelectMsub(N, &DAG))
+      selectMSUB(N, &DAG))
     return SDValue(N, 0);
 
   return SDValue();
 }
 
-static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const MipsSubtarget *Subtarget) {
   if (DCI.isBeforeLegalizeOps())
@@ -734,18 +589,18 @@ static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG &DAG,
   EVT Ty = N->getValueType(0);
   unsigned LO = (Ty == MVT::i32) ? Mips::LO : Mips::LO64;
   unsigned HI = (Ty == MVT::i32) ? Mips::HI : Mips::HI64;
-  unsigned opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem :
+  unsigned Opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem :
                                                   MipsISD::DivRemU;
-  DebugLoc dl = N->getDebugLoc();
+  DebugLoc DL = N->getDebugLoc();
 
-  SDValue DivRem = DAG.getNode(opc, dl, MVT::Glue,
+  SDValue DivRem = DAG.getNode(Opc, DL, MVT::Glue,
                                N->getOperand(0), N->getOperand(1));
   SDValue InChain = DAG.getEntryNode();
   SDValue InGlue = DivRem;
 
   // insert MFLO
   if (N->hasAnyUseOfValue(0)) {
-    SDValue CopyFromLo = DAG.getCopyFromReg(InChain, dl, LO, Ty,
+    SDValue CopyFromLo = DAG.getCopyFromReg(InChain, DL, LO, Ty,
                                             InGlue);
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyFromLo);
     InChain = CopyFromLo.getValue(1);
@@ -754,7 +609,7 @@ static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG &DAG,
 
   // insert MFHI
   if (N->hasAnyUseOfValue(1)) {
-    SDValue CopyFromHi = DAG.getCopyFromReg(InChain, dl,
+    SDValue CopyFromHi = DAG.getCopyFromReg(InChain, DL,
                                             HI, Ty, InGlue);
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi);
   }
@@ -790,7 +645,7 @@ static Mips::CondCode FPCondCCodeToFCC(ISD::CondCode CC) {
 
 
 // Returns true if condition code has to be inverted.
-static bool InvertFPCondCode(Mips::CondCode CC) {
+static bool invertFPCondCode(Mips::CondCode CC) {
   if (CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT)
     return false;
 
@@ -802,7 +657,7 @@ static bool InvertFPCondCode(Mips::CondCode CC) {
 
 // Creates and returns an FPCmp node from a setcc node.
 // Returns Op if setcc is not a floating point comparison.
-static SDValue CreateFPCmp(SelectionDAG &DAG, const SDValue &Op) {
+static SDValue createFPCmp(SelectionDAG &DAG, const SDValue &Op) {
   // must be a SETCC node
   if (Op.getOpcode() != ISD::SETCC)
     return Op;
@@ -813,20 +668,20 @@ static SDValue CreateFPCmp(SelectionDAG &DAG, const SDValue &Op) {
     return Op;
 
   SDValue RHS = Op.getOperand(1);
-  DebugLoc dl = Op.getDebugLoc();
+  DebugLoc DL = Op.getDebugLoc();
 
   // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of
   // node if necessary.
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
 
-  return DAG.getNode(MipsISD::FPCmp, dl, MVT::Glue, LHS, RHS,
+  return DAG.getNode(MipsISD::FPCmp, DL, MVT::Glue, LHS, RHS,
                      DAG.getConstant(FPCondCCodeToFCC(CC), MVT::i32));
 }
 
 // Creates and returns a CMovFPT/F node.
-static SDValue CreateCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
+static SDValue createCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
                             SDValue False, DebugLoc DL) {
-  bool invert = InvertFPCondCode((Mips::CondCode)
+  bool invert = invertFPCondCode((Mips::CondCode)
                                  cast<ConstantSDNode>(Cond.getOperand(2))
                                  ->getSExtValue());
 
@@ -834,7 +689,7 @@ static SDValue CreateCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
                      True.getValueType(), True, False, Cond);
 }
 
-static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const MipsSubtarget *Subtarget) {
   if (DCI.isBeforeLegalizeOps())
@@ -867,7 +722,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True);
 }
 
-static SDValue PerformANDCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const MipsSubtarget *Subtarget) {
   // Pattern match EXT.
@@ -893,7 +748,7 @@ static SDValue PerformANDCombine(SDNode *N, SelectionDAG &DAG,
 
   // Op's second operand must be a shifted mask.
   if (!(CN = dyn_cast<ConstantSDNode>(Mask)) ||
-      !IsShiftedMask(CN->getZExtValue(), SMPos, SMSize))
+      !isShiftedMask(CN->getZExtValue(), SMPos, SMSize))
     return SDValue();
 
   // Return if the shifted mask does not start at bit 0 or the sum of its size
@@ -907,7 +762,7 @@ static SDValue PerformANDCombine(SDNode *N, SelectionDAG &DAG,
                      DAG.getConstant(SMSize, MVT::i32));
 }
 
-static SDValue PerformORCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const MipsSubtarget *Subtarget) {
   // Pattern match INS.
@@ -926,7 +781,7 @@ static SDValue PerformORCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   if (!(CN = dyn_cast<ConstantSDNode>(And0.getOperand(1))) ||
-      !IsShiftedMask(~CN->getSExtValue(), SMPos0, SMSize0))
+      !isShiftedMask(~CN->getSExtValue(), SMPos0, SMSize0))
     return SDValue();
 
   // See if Op's second operand matches (and (shl $src, pos), mask1).
@@ -934,7 +789,7 @@ static SDValue PerformORCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) ||
-      !IsShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
+      !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
     return SDValue();
 
   // The shift masks must have the same position and size.
@@ -961,7 +816,7 @@ static SDValue PerformORCombine(SDNode *N, SelectionDAG &DAG,
                      DAG.getConstant(SMSize0, MVT::i32), And0.getOperand(0));
 }
 
-static SDValue PerformADDCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const MipsSubtarget *Subtarget) {
   // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
@@ -991,25 +846,25 @@ static SDValue PerformADDCombine(SDNode *N, SelectionDAG &DAG,
 SDValue  MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
   const {
   SelectionDAG &DAG = DCI.DAG;
-  unsigned opc = N->getOpcode();
+  unsigned Opc = N->getOpcode();
 
-  switch (opc) {
+  switch (Opc) {
   default: break;
   case ISD::ADDE:
-    return PerformADDECombine(N, DAG, DCI, Subtarget);
+    return performADDECombine(N, DAG, DCI, Subtarget);
   case ISD::SUBE:
-    return PerformSUBECombine(N, DAG, DCI, Subtarget);
+    return performSUBECombine(N, DAG, DCI, Subtarget);
   case ISD::SDIVREM:
   case ISD::UDIVREM:
-    return PerformDivRemCombine(N, DAG, DCI, Subtarget);
+    return performDivRemCombine(N, DAG, DCI, Subtarget);
   case ISD::SELECT:
-    return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+    return performSELECTCombine(N, DAG, DCI, Subtarget);
   case ISD::AND:
-    return PerformANDCombine(N, DAG, DCI, Subtarget);
+    return performANDCombine(N, DAG, DCI, Subtarget);
   case ISD::OR:
-    return PerformORCombine(N, DAG, DCI, Subtarget);
+    return performORCombine(N, DAG, DCI, Subtarget);
   case ISD::ADD:
-    return PerformADDCombine(N, DAG, DCI, Subtarget);
+    return performADDCombine(N, DAG, DCI, Subtarget);
   }
 
   return SDValue();
@@ -1040,31 +895,32 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
 {
   switch (Op.getOpcode())
   {
-    case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
-    case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
-    case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
-    case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
-    case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
-    case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
-    case ISD::SELECT:             return LowerSELECT(Op, DAG);
-    case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
-    case ISD::SETCC:              return LowerSETCC(Op, DAG);
-    case ISD::VASTART:            return LowerVASTART(Op, DAG);
-    case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
-    case ISD::FABS:               return LowerFABS(Op, DAG);
-    case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
-    case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
-    case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
-    case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op, DAG);
-    case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, DAG);
-    case ISD::SHL_PARTS:          return LowerShiftLeftParts(Op, DAG);
-    case ISD::SRA_PARTS:          return LowerShiftRightParts(Op, DAG, true);
-    case ISD::SRL_PARTS:          return LowerShiftRightParts(Op, DAG, false);
-    case ISD::LOAD:               return LowerLOAD(Op, DAG);
-    case ISD::STORE:              return LowerSTORE(Op, DAG);
-    case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
-    case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
-    case ISD::ADD:                return LowerADD(Op, DAG);
+    case ISD::BR_JT:              return lowerBR_JT(Op, DAG);
+    case ISD::BRCOND:             return lowerBRCOND(Op, DAG);
+    case ISD::ConstantPool:       return lowerConstantPool(Op, DAG);
+    case ISD::GlobalAddress:      return lowerGlobalAddress(Op, DAG);
+    case ISD::BlockAddress:       return lowerBlockAddress(Op, DAG);
+    case ISD::GlobalTLSAddress:   return lowerGlobalTLSAddress(Op, DAG);
+    case ISD::JumpTable:          return lowerJumpTable(Op, DAG);
+    case ISD::SELECT:             return lowerSELECT(Op, DAG);
+    case ISD::SELECT_CC:          return lowerSELECT_CC(Op, DAG);
+    case ISD::SETCC:              return lowerSETCC(Op, DAG);
+    case ISD::VASTART:            return lowerVASTART(Op, DAG);
+    case ISD::FCOPYSIGN:          return lowerFCOPYSIGN(Op, DAG);
+    case ISD::FABS:               return lowerFABS(Op, DAG);
+    case ISD::FRAMEADDR:          return lowerFRAMEADDR(Op, DAG);
+    case ISD::RETURNADDR:         return lowerRETURNADDR(Op, DAG);
+    case ISD::EH_RETURN:          return lowerEH_RETURN(Op, DAG);
+    case ISD::MEMBARRIER:         return lowerMEMBARRIER(Op, DAG);
+    case ISD::ATOMIC_FENCE:       return lowerATOMIC_FENCE(Op, DAG);
+    case ISD::SHL_PARTS:          return lowerShiftLeftParts(Op, DAG);
+    case ISD::SRA_PARTS:          return lowerShiftRightParts(Op, DAG, true);
+    case ISD::SRL_PARTS:          return lowerShiftRightParts(Op, DAG, false);
+    case ISD::LOAD:               return lowerLOAD(Op, DAG);
+    case ISD::STORE:              return lowerSTORE(Op, DAG);
+    case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);
+    case ISD::INTRINSIC_W_CHAIN:  return lowerINTRINSIC_W_CHAIN(Op, DAG);
+    case ISD::ADD:                return lowerADD(Op, DAG);
   }
   return SDValue();
 }
@@ -1073,11 +929,11 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
 //  Lower helper functions
 //===----------------------------------------------------------------------===//
 
-// AddLiveIn - This helper function adds the specified physical register to the
+// addLiveIn - This helper function adds the specified physical register to the
 // MachineFunction as a live in value.  It also creates a corresponding
 // virtual register for it.
 static unsigned
-AddLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
+addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
 {
   unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
   MF.getRegInfo().addLiveIn(PReg, VReg);
@@ -1085,7 +941,7 @@ AddLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
 }
 
 // Get fp branch code (not opcode) from condition code.
-static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) {
+static Mips::FPBranchCode getFPBranchCodeFromCond(Mips::CondCode CC) {
   if (CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT)
     return Mips::BRANCH_T;
 
@@ -1095,425 +951,6 @@ static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) {
   return Mips::BRANCH_F;
 }
 
-/*
-static MachineBasicBlock* ExpandCondMov(MachineInstr *MI, MachineBasicBlock *BB,
-                                        DebugLoc dl,
-                                        const MipsSubtarget *Subtarget,
-                                        const TargetInstrInfo *TII,
-                                        bool isFPCmp, unsigned Opc) {
-  // There is no need to expand CMov instructions if target has
-  // conditional moves.
-  if (Subtarget->hasCondMov())
-    return BB;
-
-  // To "insert" a SELECT_CC instruction, we actually have to insert the
-  // diamond control-flow pattern.  The incoming instruction knows the
-  // destination vreg to set, the condition code register to branch on, the
-  // true/false values to select between, and a branch opcode to use.
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator It = BB;
-  ++It;
-
-  //  thisMBB:
-  //  ...
-  //   TrueVal = ...
-  //   setcc r1, r2, r3
-  //   bNE   r1, r0, copy1MBB
-  //   fallthrough --> copy0MBB
-  MachineBasicBlock *thisMBB  = BB;
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(It, copy0MBB);
-  F->insert(It, sinkMBB);
-
-  // Transfer the remainder of BB and its successor edges to sinkMBB.
-  sinkMBB->splice(sinkMBB->begin(), BB,
-                  llvm::next(MachineBasicBlock::iterator(MI)),
-                  BB->end());
-  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  // Next, add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(copy0MBB);
-  BB->addSuccessor(sinkMBB);
-
-  // Emit the right instruction according to the type of the operands compared
-  if (isFPCmp)
-    BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB);
-  else
-    BuildMI(BB, dl, TII->get(Opc)).addReg(MI->getOperand(2).getReg())
-      .addReg(Mips::ZERO).addMBB(sinkMBB);
-
-  //  copy0MBB:
-  //   %FalseValue = ...
-  //   # fallthrough to sinkMBB
-  BB = copy0MBB;
-
-  // Update machine-CFG edges
-  BB->addSuccessor(sinkMBB);
-
-  //  sinkMBB:
-  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
-  //  ...
-  BB = sinkMBB;
-
-  if (isFPCmp)
-    BuildMI(*BB, BB->begin(), dl,
-            TII->get(Mips::PHI), MI->getOperand(0).getReg())
-      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB)
-      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB);
-  else
-    BuildMI(*BB, BB->begin(), dl,
-            TII->get(Mips::PHI), MI->getOperand(0).getReg())
-      .addReg(MI->getOperand(3).getReg()).addMBB(thisMBB)
-      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB);
-
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return BB;
-}
-*/
-
-MachineBasicBlock *
-MipsTargetLowering::EmitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
-  // $bb:
-  //  bposge32_pseudo $vr0
-  //  =>
-  // $bb:
-  //  bposge32 $tbb
-  // $fbb:
-  //  li $vr2, 0
-  //  b $sink
-  // $tbb:
-  //  li $vr1, 1
-  // $sink:
-  //  $vr0 = phi($vr2, $fbb, $vr1, $tbb)
-
-  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  const TargetRegisterClass *RC = &Mips::CPURegsRegClass;
-  DebugLoc DL = MI->getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator It = llvm::next(MachineFunction::iterator(BB));
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *Sink  = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(It, FBB);
-  F->insert(It, TBB);
-  F->insert(It, Sink);
-
-  // Transfer the remainder of BB and its successor edges to Sink.
-  Sink->splice(Sink->begin(), BB, llvm::next(MachineBasicBlock::iterator(MI)),
-               BB->end());
-  Sink->transferSuccessorsAndUpdatePHIs(BB);
-
-  // Add successors.
-  BB->addSuccessor(FBB);
-  BB->addSuccessor(TBB);
-  FBB->addSuccessor(Sink);
-  TBB->addSuccessor(Sink);
-
-  // Insert the real bposge32 instruction to $BB.
-  BuildMI(BB, DL, TII->get(Mips::BPOSGE32)).addMBB(TBB);
-
-  // Fill $FBB.
-  unsigned VR2 = RegInfo.createVirtualRegister(RC);
-  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), VR2)
-    .addReg(Mips::ZERO).addImm(0);
-  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
-
-  // Fill $TBB.
-  unsigned VR1 = RegInfo.createVirtualRegister(RC);
-  BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), VR1)
-    .addReg(Mips::ZERO).addImm(1);
-
-  // Insert phi function to $Sink.
-  BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
-          MI->getOperand(0).getReg())
-    .addReg(VR2).addMBB(FBB).addReg(VR1).addMBB(TBB);
-
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return Sink;
-}
-
-MachineBasicBlock *MipsTargetLowering::EmitSel16(unsigned Opc, MachineInstr *MI,
-                             MachineBasicBlock *BB) const {
-  if (DontExpandCondPseudos16)
-    return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
-  // To "insert" a SELECT_CC instruction, we actually have to insert the
-  // diamond control-flow pattern.  The incoming instruction knows the
-  // destination vreg to set, the condition code register to branch on, the
-  // true/false values to select between, and a branch opcode to use.
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator It = BB;
-  ++It;
-
-  //  thisMBB:
-  //  ...
-  //   TrueVal = ...
-  //   setcc r1, r2, r3
-  //   bNE   r1, r0, copy1MBB
-  //   fallthrough --> copy0MBB
-  MachineBasicBlock *thisMBB  = BB;
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(It, copy0MBB);
-  F->insert(It, sinkMBB);
-
-  // Transfer the remainder of BB and its successor edges to sinkMBB.
-  sinkMBB->splice(sinkMBB->begin(), BB,
-                  llvm::next(MachineBasicBlock::iterator(MI)),
-                  BB->end());
-  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  // Next, add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(copy0MBB);
-  BB->addSuccessor(sinkMBB);
-
-  BuildMI(BB, dl, TII->get(Opc)).addReg(MI->getOperand(3).getReg())
-    .addMBB(sinkMBB);
-
-  //  copy0MBB:
-  //   %FalseValue = ...
-  //   # fallthrough to sinkMBB
-  BB = copy0MBB;
-
-  // Update machine-CFG edges
-  BB->addSuccessor(sinkMBB);
-
-  //  sinkMBB:
-  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
-  //  ...
-  BB = sinkMBB;
-
-  BuildMI(*BB, BB->begin(), dl,
-          TII->get(Mips::PHI), MI->getOperand(0).getReg())
-    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
-    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
-
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return BB;
-}
-
-MachineBasicBlock *MipsTargetLowering::EmitSelT16
-  (unsigned Opc1, unsigned Opc2,
-   MachineInstr *MI, MachineBasicBlock *BB) const {
-  if (DontExpandCondPseudos16)
-    return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
-  // To "insert" a SELECT_CC instruction, we actually have to insert the
-  // diamond control-flow pattern.  The incoming instruction knows the
-  // destination vreg to set, the condition code register to branch on, the
-  // true/false values to select between, and a branch opcode to use.
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator It = BB;
-  ++It;
-
-  //  thisMBB:
-  //  ...
-  //   TrueVal = ...
-  //   setcc r1, r2, r3
-  //   bNE   r1, r0, copy1MBB
-  //   fallthrough --> copy0MBB
-  MachineBasicBlock *thisMBB  = BB;
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(It, copy0MBB);
-  F->insert(It, sinkMBB);
-
-  // Transfer the remainder of BB and its successor edges to sinkMBB.
-  sinkMBB->splice(sinkMBB->begin(), BB,
-                  llvm::next(MachineBasicBlock::iterator(MI)),
-                  BB->end());
-  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  // Next, add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(copy0MBB);
-  BB->addSuccessor(sinkMBB);
-
-  BuildMI(BB, dl, TII->get(Opc2)).addReg(MI->getOperand(3).getReg())
-    .addReg(MI->getOperand(4).getReg());
-  BuildMI(BB, dl, TII->get(Opc1)).addMBB(sinkMBB);
-
-  //  copy0MBB:
-  //   %FalseValue = ...
-  //   # fallthrough to sinkMBB
-  BB = copy0MBB;
-
-  // Update machine-CFG edges
-  BB->addSuccessor(sinkMBB);
-
-  //  sinkMBB:
-  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
-  //  ...
-  BB = sinkMBB;
-
-  BuildMI(*BB, BB->begin(), dl,
-          TII->get(Mips::PHI), MI->getOperand(0).getReg())
-    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
-    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
-
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return BB;
-
-}
-
-
-MachineBasicBlock *MipsTargetLowering::EmitSeliT16
-  (unsigned Opc1, unsigned Opc2,
-   MachineInstr *MI, MachineBasicBlock *BB) const {
-  if (DontExpandCondPseudos16)
-    return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
-  // To "insert" a SELECT_CC instruction, we actually have to insert the
-  // diamond control-flow pattern.  The incoming instruction knows the
-  // destination vreg to set, the condition code register to branch on, the
-  // true/false values to select between, and a branch opcode to use.
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator It = BB;
-  ++It;
-
-  //  thisMBB:
-  //  ...
-  //   TrueVal = ...
-  //   setcc r1, r2, r3
-  //   bNE   r1, r0, copy1MBB
-  //   fallthrough --> copy0MBB
-  MachineBasicBlock *thisMBB  = BB;
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(It, copy0MBB);
-  F->insert(It, sinkMBB);
-
-  // Transfer the remainder of BB and its successor edges to sinkMBB.
-  sinkMBB->splice(sinkMBB->begin(), BB,
-                  llvm::next(MachineBasicBlock::iterator(MI)),
-                  BB->end());
-  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  // Next, add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(copy0MBB);
-  BB->addSuccessor(sinkMBB);
-
-  BuildMI(BB, dl, TII->get(Opc2)).addReg(MI->getOperand(3).getReg())
-    .addImm(MI->getOperand(4).getImm());
-  BuildMI(BB, dl, TII->get(Opc1)).addMBB(sinkMBB);
-
-  //  copy0MBB:
-  //   %FalseValue = ...
-  //   # fallthrough to sinkMBB
-  BB = copy0MBB;
-
-  // Update machine-CFG edges
-  BB->addSuccessor(sinkMBB);
-
-  //  sinkMBB:
-  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
-  //  ...
-  BB = sinkMBB;
-
-  BuildMI(*BB, BB->begin(), dl,
-          TII->get(Mips::PHI), MI->getOperand(0).getReg())
-    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
-    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
-
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return BB;
-
-}
-
-
-MachineBasicBlock
-  *MipsTargetLowering::EmitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
-                           MachineInstr *MI,
-                           MachineBasicBlock *BB) const {
-  if (DontExpandCondPseudos16)
-    return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  unsigned regX = MI->getOperand(0).getReg();
-  unsigned regY = MI->getOperand(1).getReg();
-  MachineBasicBlock *target = MI->getOperand(2).getMBB();
-  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX).addReg(regY);
-  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target);
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return BB;
-}
-
-
-MachineBasicBlock *MipsTargetLowering::EmitFEXT_T8I8I16_ins(
-  unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc,
-  MachineInstr *MI,  MachineBasicBlock *BB) const {
-  if (DontExpandCondPseudos16)
-    return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  unsigned regX = MI->getOperand(0).getReg();
-  int64_t imm = MI->getOperand(1).getImm();
-  MachineBasicBlock *target = MI->getOperand(2).getMBB();
-  unsigned CmpOpc;
-  if (isUInt<8>(imm))
-    CmpOpc = CmpiOpc;
-  else if (isUInt<16>(imm))
-    CmpOpc = CmpiXOpc;
-  else
-    llvm_unreachable("immediate field not usable");
-  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX).addImm(imm);
-  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target);
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return BB;
-}
-
-
-static unsigned Mips16WhichOp8uOr16simm
-  (unsigned shortOp, unsigned longOp, int64_t Imm) {
-  if (isUInt<8>(Imm))
-    return shortOp;
-  else if (isInt<16>(Imm))
-    return longOp;
-  else
-    llvm_unreachable("immediate field not usable");
-}
-
-MachineBasicBlock *MipsTargetLowering::EmitFEXT_CCRX16_ins(
-  unsigned SltOpc,
-  MachineInstr *MI,  MachineBasicBlock *BB) const {
-  if (DontExpandCondPseudos16)
-    return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  unsigned CC = MI->getOperand(0).getReg();
-  unsigned regX = MI->getOperand(1).getReg();
-  unsigned regY = MI->getOperand(2).getReg();
-  BuildMI(*BB, MI, MI->getDebugLoc(),
-		  TII->get(SltOpc)).addReg(regX).addReg(regY);
-  BuildMI(*BB, MI, MI->getDebugLoc(),
-          TII->get(Mips::MoveR3216), CC).addReg(Mips::T8);
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return BB;
-}
-MachineBasicBlock *MipsTargetLowering::EmitFEXT_CCRXI16_ins(
-  unsigned SltiOpc, unsigned SltiXOpc,
-  MachineInstr *MI,  MachineBasicBlock *BB )const {
-  if (DontExpandCondPseudos16)
-    return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  unsigned CC = MI->getOperand(0).getReg();
-  unsigned regX = MI->getOperand(1).getReg();
-  int64_t Imm = MI->getOperand(2).getImm();
-  unsigned SltOpc = Mips16WhichOp8uOr16simm(SltiOpc, SltiXOpc, Imm);
-  BuildMI(*BB, MI, MI->getDebugLoc(),
-          TII->get(SltOpc)).addReg(regX).addImm(Imm);
-  BuildMI(*BB, MI, MI->getDebugLoc(),
-          TII->get(Mips::MoveR3216), CC).addReg(Mips::T8);
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return BB;
-
-}
 MachineBasicBlock *
 MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                 MachineBasicBlock *BB) const {
@@ -1522,185 +959,114 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     llvm_unreachable("Unexpected instr type to insert");
   case Mips::ATOMIC_LOAD_ADD_I8:
   case Mips::ATOMIC_LOAD_ADD_I8_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
   case Mips::ATOMIC_LOAD_ADD_I16:
   case Mips::ATOMIC_LOAD_ADD_I16_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
   case Mips::ATOMIC_LOAD_ADD_I32:
   case Mips::ATOMIC_LOAD_ADD_I32_P8:
-    return EmitAtomicBinary(MI, BB, 4, Mips::ADDu);
+    return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
   case Mips::ATOMIC_LOAD_ADD_I64:
   case Mips::ATOMIC_LOAD_ADD_I64_P8:
-    return EmitAtomicBinary(MI, BB, 8, Mips::DADDu);
+    return emitAtomicBinary(MI, BB, 8, Mips::DADDu);
 
   case Mips::ATOMIC_LOAD_AND_I8:
   case Mips::ATOMIC_LOAD_AND_I8_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
   case Mips::ATOMIC_LOAD_AND_I16:
   case Mips::ATOMIC_LOAD_AND_I16_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
   case Mips::ATOMIC_LOAD_AND_I32:
   case Mips::ATOMIC_LOAD_AND_I32_P8:
-    return EmitAtomicBinary(MI, BB, 4, Mips::AND);
+    return emitAtomicBinary(MI, BB, 4, Mips::AND);
   case Mips::ATOMIC_LOAD_AND_I64:
   case Mips::ATOMIC_LOAD_AND_I64_P8:
-    return EmitAtomicBinary(MI, BB, 8, Mips::AND64);
+    return emitAtomicBinary(MI, BB, 8, Mips::AND64);
 
   case Mips::ATOMIC_LOAD_OR_I8:
   case Mips::ATOMIC_LOAD_OR_I8_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
   case Mips::ATOMIC_LOAD_OR_I16:
   case Mips::ATOMIC_LOAD_OR_I16_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
   case Mips::ATOMIC_LOAD_OR_I32:
   case Mips::ATOMIC_LOAD_OR_I32_P8:
-    return EmitAtomicBinary(MI, BB, 4, Mips::OR);
+    return emitAtomicBinary(MI, BB, 4, Mips::OR);
   case Mips::ATOMIC_LOAD_OR_I64:
   case Mips::ATOMIC_LOAD_OR_I64_P8:
-    return EmitAtomicBinary(MI, BB, 8, Mips::OR64);
+    return emitAtomicBinary(MI, BB, 8, Mips::OR64);
 
   case Mips::ATOMIC_LOAD_XOR_I8:
   case Mips::ATOMIC_LOAD_XOR_I8_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
   case Mips::ATOMIC_LOAD_XOR_I16:
   case Mips::ATOMIC_LOAD_XOR_I16_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
   case Mips::ATOMIC_LOAD_XOR_I32:
   case Mips::ATOMIC_LOAD_XOR_I32_P8:
-    return EmitAtomicBinary(MI, BB, 4, Mips::XOR);
+    return emitAtomicBinary(MI, BB, 4, Mips::XOR);
   case Mips::ATOMIC_LOAD_XOR_I64:
   case Mips::ATOMIC_LOAD_XOR_I64_P8:
-    return EmitAtomicBinary(MI, BB, 8, Mips::XOR64);
+    return emitAtomicBinary(MI, BB, 8, Mips::XOR64);
 
   case Mips::ATOMIC_LOAD_NAND_I8:
   case Mips::ATOMIC_LOAD_NAND_I8_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 1, 0, true);
+    return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
   case Mips::ATOMIC_LOAD_NAND_I16:
   case Mips::ATOMIC_LOAD_NAND_I16_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 2, 0, true);
+    return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
   case Mips::ATOMIC_LOAD_NAND_I32:
   case Mips::ATOMIC_LOAD_NAND_I32_P8:
-    return EmitAtomicBinary(MI, BB, 4, 0, true);
+    return emitAtomicBinary(MI, BB, 4, 0, true);
   case Mips::ATOMIC_LOAD_NAND_I64:
   case Mips::ATOMIC_LOAD_NAND_I64_P8:
-    return EmitAtomicBinary(MI, BB, 8, 0, true);
+    return emitAtomicBinary(MI, BB, 8, 0, true);
 
   case Mips::ATOMIC_LOAD_SUB_I8:
   case Mips::ATOMIC_LOAD_SUB_I8_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
   case Mips::ATOMIC_LOAD_SUB_I16:
   case Mips::ATOMIC_LOAD_SUB_I16_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
   case Mips::ATOMIC_LOAD_SUB_I32:
   case Mips::ATOMIC_LOAD_SUB_I32_P8:
-    return EmitAtomicBinary(MI, BB, 4, Mips::SUBu);
+    return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
   case Mips::ATOMIC_LOAD_SUB_I64:
   case Mips::ATOMIC_LOAD_SUB_I64_P8:
-    return EmitAtomicBinary(MI, BB, 8, Mips::DSUBu);
+    return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);
 
   case Mips::ATOMIC_SWAP_I8:
   case Mips::ATOMIC_SWAP_I8_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 1, 0);
+    return emitAtomicBinaryPartword(MI, BB, 1, 0);
   case Mips::ATOMIC_SWAP_I16:
   case Mips::ATOMIC_SWAP_I16_P8:
-    return EmitAtomicBinaryPartword(MI, BB, 2, 0);
+    return emitAtomicBinaryPartword(MI, BB, 2, 0);
   case Mips::ATOMIC_SWAP_I32:
   case Mips::ATOMIC_SWAP_I32_P8:
-    return EmitAtomicBinary(MI, BB, 4, 0);
+    return emitAtomicBinary(MI, BB, 4, 0);
   case Mips::ATOMIC_SWAP_I64:
   case Mips::ATOMIC_SWAP_I64_P8:
-    return EmitAtomicBinary(MI, BB, 8, 0);
+    return emitAtomicBinary(MI, BB, 8, 0);
 
   case Mips::ATOMIC_CMP_SWAP_I8:
   case Mips::ATOMIC_CMP_SWAP_I8_P8:
-    return EmitAtomicCmpSwapPartword(MI, BB, 1);
+    return emitAtomicCmpSwapPartword(MI, BB, 1);
   case Mips::ATOMIC_CMP_SWAP_I16:
   case Mips::ATOMIC_CMP_SWAP_I16_P8:
-    return EmitAtomicCmpSwapPartword(MI, BB, 2);
+    return emitAtomicCmpSwapPartword(MI, BB, 2);
   case Mips::ATOMIC_CMP_SWAP_I32:
   case Mips::ATOMIC_CMP_SWAP_I32_P8:
-    return EmitAtomicCmpSwap(MI, BB, 4);
+    return emitAtomicCmpSwap(MI, BB, 4);
   case Mips::ATOMIC_CMP_SWAP_I64:
   case Mips::ATOMIC_CMP_SWAP_I64_P8:
-    return EmitAtomicCmpSwap(MI, BB, 8);
-  case Mips::BPOSGE32_PSEUDO:
-    return EmitBPOSGE32(MI, BB);
-  case Mips::SelBeqZ:
-    return EmitSel16(Mips::BeqzRxImm16, MI, BB);
-  case Mips::SelBneZ:
-    return EmitSel16(Mips::BnezRxImm16, MI, BB);
-  case Mips::SelTBteqZCmpi:
-    return EmitSeliT16(Mips::BteqzX16, Mips::CmpiRxImmX16, MI, BB);
-  case Mips::SelTBteqZSlti:
-    return EmitSeliT16(Mips::BteqzX16, Mips::SltiRxImmX16, MI, BB);
-  case Mips::SelTBteqZSltiu:
-    return EmitSeliT16(Mips::BteqzX16, Mips::SltiuRxImmX16, MI, BB);
-  case Mips::SelTBtneZCmpi:
-    return EmitSeliT16(Mips::BtnezX16, Mips::CmpiRxImmX16, MI, BB);
-  case Mips::SelTBtneZSlti:
-    return EmitSeliT16(Mips::BtnezX16, Mips::SltiRxImmX16, MI, BB);
-  case Mips::SelTBtneZSltiu:
-    return EmitSeliT16(Mips::BtnezX16, Mips::SltiuRxImmX16, MI, BB);
-  case Mips::SelTBteqZCmp:
-    return EmitSelT16(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
-  case Mips::SelTBteqZSlt:
-    return EmitSelT16(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
-  case Mips::SelTBteqZSltu:
-    return EmitSelT16(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
-  case Mips::SelTBtneZCmp:
-    return EmitSelT16(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
-  case Mips::SelTBtneZSlt:
-    return EmitSelT16(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
-  case Mips::SelTBtneZSltu:
-    return EmitSelT16(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
-  case Mips::BteqzT8CmpX16:
-    return EmitFEXT_T8I816_ins(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
-  case Mips::BteqzT8SltX16:
-    return EmitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
-  case Mips::BteqzT8SltuX16:
-    // TBD: figure out a way to get this or remove the instruction
-    // altogether.
-    return EmitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
-  case Mips::BtnezT8CmpX16:
-    return EmitFEXT_T8I816_ins(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
-  case Mips::BtnezT8SltX16:
-    return EmitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
-  case Mips::BtnezT8SltuX16:
-    // TBD: figure out a way to get this or remove the instruction
-    // altogether.
-    return EmitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
-  case Mips::BteqzT8CmpiX16: return EmitFEXT_T8I8I16_ins(
-    Mips::BteqzX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
-  case Mips::BteqzT8SltiX16: return EmitFEXT_T8I8I16_ins(
-    Mips::BteqzX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
-  case Mips::BteqzT8SltiuX16: return EmitFEXT_T8I8I16_ins(
-    Mips::BteqzX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
-  case Mips::BtnezT8CmpiX16: return EmitFEXT_T8I8I16_ins(
-    Mips::BtnezX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
-  case Mips::BtnezT8SltiX16: return EmitFEXT_T8I8I16_ins(
-    Mips::BtnezX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
-  case Mips::BtnezT8SltiuX16: return EmitFEXT_T8I8I16_ins(
-    Mips::BtnezX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
-    break;
-  case Mips::SltCCRxRy16:
-    return EmitFEXT_CCRX16_ins(Mips::SltRxRy16, MI, BB);
-    break;
-  case Mips::SltiCCRxImmX16:
-    return EmitFEXT_CCRXI16_ins
-      (Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
-  case Mips::SltiuCCRxImmX16:
-    return EmitFEXT_CCRXI16_ins
-      (Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
-  case Mips::SltuCCRxRy16:
-    return EmitFEXT_CCRX16_ins
-      (Mips::SltuRxRy16, MI, BB);
+    return emitAtomicCmpSwap(MI, BB, 8);
   }
 }
 
 // This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
 // Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
 MachineBasicBlock *
-MipsTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                      unsigned Size, unsigned BinOpcode,
                                      bool Nand) const {
   assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicBinary.");
@@ -1709,7 +1075,7 @@ MipsTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
+  DebugLoc DL = MI->getDebugLoc();
   unsigned LL, SC, AND, NOR, ZERO, BEQ;
 
   if (Size == 4) {
@@ -1765,20 +1131,20 @@ MipsTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   //    sc success, storeval, 0(ptr)
   //    beq success, $0, loopMBB
   BB = loopMBB;
-  BuildMI(BB, dl, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
   if (Nand) {
     //  and andres, oldval, incr
     //  nor storeval, $0, andres
-    BuildMI(BB, dl, TII->get(AND), AndRes).addReg(OldVal).addReg(Incr);
-    BuildMI(BB, dl, TII->get(NOR), StoreVal).addReg(ZERO).addReg(AndRes);
+    BuildMI(BB, DL, TII->get(AND), AndRes).addReg(OldVal).addReg(Incr);
+    BuildMI(BB, DL, TII->get(NOR), StoreVal).addReg(ZERO).addReg(AndRes);
   } else if (BinOpcode) {
     //  <binop> storeval, oldval, incr
-    BuildMI(BB, dl, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
+    BuildMI(BB, DL, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
   } else {
     StoreVal = Incr;
   }
-  BuildMI(BB, dl, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
-  BuildMI(BB, dl, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
+  BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
 
   MI->eraseFromParent();   // The instruction is gone now.
 
@@ -1786,7 +1152,7 @@ MipsTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
 }
 
 MachineBasicBlock *
-MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
+MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
                                              MachineBasicBlock *BB,
                                              unsigned Size, unsigned BinOpcode,
                                              bool Nand) const {
@@ -1797,7 +1163,7 @@ MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
+  DebugLoc DL = MI->getDebugLoc();
   unsigned LL = IsN64 ? Mips::LL_P8 : Mips::LL;
   unsigned SC = IsN64 ? Mips::SC_P8 : Mips::SC;
 
@@ -1856,18 +1222,18 @@ MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
   //    sll     incr2,incr,shiftamt
 
   int64_t MaskImm = (Size == 1) ? 255 : 65535;
-  BuildMI(BB, dl, TII->get(Mips::ADDiu), MaskLSB2)
+  BuildMI(BB, DL, TII->get(Mips::ADDiu), MaskLSB2)
     .addReg(Mips::ZERO).addImm(-4);
-  BuildMI(BB, dl, TII->get(Mips::AND), AlignedAddr)
+  BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
     .addReg(Ptr).addReg(MaskLSB2);
-  BuildMI(BB, dl, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
-  BuildMI(BB, dl, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
-  BuildMI(BB, dl, TII->get(Mips::ORi), MaskUpper)
+  BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+  BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
     .addReg(Mips::ZERO).addImm(MaskImm);
-  BuildMI(BB, dl, TII->get(Mips::SLLV), Mask)
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
     .addReg(ShiftAmt).addReg(MaskUpper);
-  BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
-  BuildMI(BB, dl, TII->get(Mips::SLLV), Incr2).addReg(ShiftAmt).addReg(Incr);
+  BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Incr2).addReg(ShiftAmt).addReg(Incr);
 
   // atomic.load.binop
   // loopMBB:
@@ -1889,32 +1255,32 @@ MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
   //   beq     success,$0,loopMBB
 
   BB = loopMBB;
-  BuildMI(BB, dl, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
   if (Nand) {
     //  and andres, oldval, incr2
     //  nor binopres, $0, andres
     //  and newval, binopres, mask
-    BuildMI(BB, dl, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
-    BuildMI(BB, dl, TII->get(Mips::NOR), BinOpRes)
+    BuildMI(BB, DL, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
+    BuildMI(BB, DL, TII->get(Mips::NOR), BinOpRes)
       .addReg(Mips::ZERO).addReg(AndRes);
-    BuildMI(BB, dl, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
   } else if (BinOpcode) {
     //  <binop> binopres, oldval, incr2
     //  and newval, binopres, mask
-    BuildMI(BB, dl, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
-    BuildMI(BB, dl, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+    BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
   } else {// atomic.swap
     //  and newval, incr2, mask
-    BuildMI(BB, dl, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
   }
 
-  BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal0)
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
     .addReg(OldVal).addReg(Mask2);
-  BuildMI(BB, dl, TII->get(Mips::OR), StoreVal)
+  BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
     .addReg(MaskedOldVal0).addReg(NewVal);
-  BuildMI(BB, dl, TII->get(SC), Success)
+  BuildMI(BB, DL, TII->get(SC), Success)
     .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
-  BuildMI(BB, dl, TII->get(Mips::BEQ))
+  BuildMI(BB, DL, TII->get(Mips::BEQ))
     .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
 
   //  sinkMBB:
@@ -1925,13 +1291,13 @@ MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
   BB = sinkMBB;
   int64_t ShiftImm = (Size == 1) ? 24 : 16;
 
-  BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal1)
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
     .addReg(OldVal).addReg(Mask);
-  BuildMI(BB, dl, TII->get(Mips::SRLV), SrlRes)
+  BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
       .addReg(ShiftAmt).addReg(MaskedOldVal1);
-  BuildMI(BB, dl, TII->get(Mips::SLL), SllRes)
+  BuildMI(BB, DL, TII->get(Mips::SLL), SllRes)
       .addReg(SrlRes).addImm(ShiftImm);
-  BuildMI(BB, dl, TII->get(Mips::SRA), Dest)
+  BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
       .addReg(SllRes).addImm(ShiftImm);
 
   MI->eraseFromParent();   // The instruction is gone now.
@@ -1940,7 +1306,7 @@ MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
 }
 
 MachineBasicBlock *
-MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
+MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
                                       MachineBasicBlock *BB,
                                       unsigned Size) const {
   assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
@@ -1949,7 +1315,7 @@ MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
+  DebugLoc DL = MI->getDebugLoc();
   unsigned LL, SC, ZERO, BNE, BEQ;
 
   if (Size == 4) {
@@ -2003,17 +1369,17 @@ MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   //   ll dest, 0(ptr)
   //   bne dest, oldval, exitMBB
   BB = loop1MBB;
-  BuildMI(BB, dl, TII->get(LL), Dest).addReg(Ptr).addImm(0);
-  BuildMI(BB, dl, TII->get(BNE))
+  BuildMI(BB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(BNE))
     .addReg(Dest).addReg(OldVal).addMBB(exitMBB);
 
   // loop2MBB:
   //   sc success, newval, 0(ptr)
   //   beq success, $0, loop1MBB
   BB = loop2MBB;
-  BuildMI(BB, dl, TII->get(SC), Success)
+  BuildMI(BB, DL, TII->get(SC), Success)
     .addReg(NewVal).addReg(Ptr).addImm(0);
-  BuildMI(BB, dl, TII->get(BEQ))
+  BuildMI(BB, DL, TII->get(BEQ))
     .addReg(Success).addReg(ZERO).addMBB(loop1MBB);
 
   MI->eraseFromParent();   // The instruction is gone now.
@@ -2022,7 +1388,7 @@ MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
 }
 
 MachineBasicBlock *
-MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
+MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
                                               MachineBasicBlock *BB,
                                               unsigned Size) const {
   assert((Size == 1 || Size == 2) &&
@@ -2032,7 +1398,7 @@ MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
+  DebugLoc DL = MI->getDebugLoc();
   unsigned LL = IsN64 ? Mips::LL_P8 : Mips::LL;
   unsigned SC = IsN64 ? Mips::SC_P8 : Mips::SC;
 
@@ -2099,24 +1465,24 @@ MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
   //    andi    maskednewval,newval,255
   //    sll     shiftednewval,maskednewval,shiftamt
   int64_t MaskImm = (Size == 1) ? 255 : 65535;
-  BuildMI(BB, dl, TII->get(Mips::ADDiu), MaskLSB2)
+  BuildMI(BB, DL, TII->get(Mips::ADDiu), MaskLSB2)
     .addReg(Mips::ZERO).addImm(-4);
-  BuildMI(BB, dl, TII->get(Mips::AND), AlignedAddr)
+  BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
     .addReg(Ptr).addReg(MaskLSB2);
-  BuildMI(BB, dl, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
-  BuildMI(BB, dl, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
-  BuildMI(BB, dl, TII->get(Mips::ORi), MaskUpper)
+  BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+  BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
     .addReg(Mips::ZERO).addImm(MaskImm);
-  BuildMI(BB, dl, TII->get(Mips::SLLV), Mask)
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
     .addReg(ShiftAmt).addReg(MaskUpper);
-  BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
-  BuildMI(BB, dl, TII->get(Mips::ANDi), MaskedCmpVal)
+  BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedCmpVal)
     .addReg(CmpVal).addImm(MaskImm);
-  BuildMI(BB, dl, TII->get(Mips::SLLV), ShiftedCmpVal)
+  BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedCmpVal)
     .addReg(ShiftAmt).addReg(MaskedCmpVal);
-  BuildMI(BB, dl, TII->get(Mips::ANDi), MaskedNewVal)
+  BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedNewVal)
     .addReg(NewVal).addImm(MaskImm);
-  BuildMI(BB, dl, TII->get(Mips::SLLV), ShiftedNewVal)
+  BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedNewVal)
     .addReg(ShiftAmt).addReg(MaskedNewVal);
 
   //  loop1MBB:
@@ -2124,10 +1490,10 @@ MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
   //    and     maskedoldval0,oldval,mask
   //    bne     maskedoldval0,shiftedcmpval,sinkMBB
   BB = loop1MBB;
-  BuildMI(BB, dl, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
-  BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal0)
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
     .addReg(OldVal).addReg(Mask);
-  BuildMI(BB, dl, TII->get(Mips::BNE))
+  BuildMI(BB, DL, TII->get(Mips::BNE))
     .addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);
 
   //  loop2MBB:
@@ -2136,13 +1502,13 @@ MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
   //    sc      success,storeval,0(alignedaddr)
   //    beq     success,$0,loop1MBB
   BB = loop2MBB;
-  BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal1)
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
     .addReg(OldVal).addReg(Mask2);
-  BuildMI(BB, dl, TII->get(Mips::OR), StoreVal)
+  BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
     .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
-  BuildMI(BB, dl, TII->get(SC), Success)
+  BuildMI(BB, DL, TII->get(SC), Success)
       .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
-  BuildMI(BB, dl, TII->get(Mips::BEQ))
+  BuildMI(BB, DL, TII->get(Mips::BEQ))
       .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
 
   //  sinkMBB:
@@ -2152,11 +1518,11 @@ MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
   BB = sinkMBB;
   int64_t ShiftImm = (Size == 1) ? 24 : 16;
 
-  BuildMI(BB, dl, TII->get(Mips::SRLV), SrlRes)
+  BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
       .addReg(ShiftAmt).addReg(MaskedOldVal0);
-  BuildMI(BB, dl, TII->get(Mips::SLL), SllRes)
+  BuildMI(BB, DL, TII->get(Mips::SLL), SllRes)
       .addReg(SrlRes).addImm(ShiftImm);
-  BuildMI(BB, dl, TII->get(Mips::SRA), Dest)
+  BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
       .addReg(SllRes).addImm(ShiftImm);
 
   MI->eraseFromParent();   // The instruction is gone now.
@@ -2167,16 +1533,46 @@ MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
 //===----------------------------------------------------------------------===//
 //  Misc Lower Operation implementation
 //===----------------------------------------------------------------------===//
+SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  DebugLoc DL = Op.getDebugLoc();
+  EVT PTy = getPointerTy();
+  unsigned EntrySize =
+    DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(*getDataLayout());
+
+  Index = DAG.getNode(ISD::MUL, DL, PTy, Index,
+                      DAG.getConstant(EntrySize, PTy));
+  SDValue Addr = DAG.getNode(ISD::ADD, DL, PTy, Index, Table);
+
+  EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
+  Addr = DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr,
+                        MachinePointerInfo::getJumpTable(), MemVT, false, false,
+                        0);
+  Chain = Addr.getValue(1);
+
+  if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || IsN64) {
+    // For PIC, the sequence is:
+    // BRIND(load(Jumptable + index) + RelocBase)
+    // RelocBase can be JumpTable, GOT or some sort of global base.
+    Addr = DAG.getNode(ISD::ADD, DL, PTy, Addr,
+                       getPICJumpTableRelocBase(Table, DAG));
+  }
+
+  return DAG.getNode(ISD::BRIND, DL, MVT::Other, Chain, Addr);
+}
+
 SDValue MipsTargetLowering::
-LowerBRCOND(SDValue Op, SelectionDAG &DAG) const
+lowerBRCOND(SDValue Op, SelectionDAG &DAG) const
 {
   // The first operand is the chain, the second is the condition, the third is
   // the block to branch to if the condition is true.
   SDValue Chain = Op.getOperand(0);
   SDValue Dest = Op.getOperand(2);
-  DebugLoc dl = Op.getDebugLoc();
+  DebugLoc DL = Op.getDebugLoc();
 
-  SDValue CondRes = CreateFPCmp(DAG, Op.getOperand(1));
+  SDValue CondRes = createFPCmp(DAG, Op.getOperand(1));
 
   // Return if flag is not set by a floating point comparison.
   if (CondRes.getOpcode() != MipsISD::FPCmp)
@@ -2185,27 +1581,27 @@ LowerBRCOND(SDValue Op, SelectionDAG &DAG) const
   SDValue CCNode  = CondRes.getOperand(2);
   Mips::CondCode CC =
     (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue();
-  SDValue BrCode = DAG.getConstant(GetFPBranchCodeFromCond(CC), MVT::i32);
+  SDValue BrCode = DAG.getConstant(getFPBranchCodeFromCond(CC), MVT::i32);
 
-  return DAG.getNode(MipsISD::FPBrcond, dl, Op.getValueType(), Chain, BrCode,
+  return DAG.getNode(MipsISD::FPBrcond, DL, Op.getValueType(), Chain, BrCode,
                      Dest, CondRes);
 }
 
 SDValue MipsTargetLowering::
-LowerSELECT(SDValue Op, SelectionDAG &DAG) const
+lowerSELECT(SDValue Op, SelectionDAG &DAG) const
 {
-  SDValue Cond = CreateFPCmp(DAG, Op.getOperand(0));
+  SDValue Cond = createFPCmp(DAG, Op.getOperand(0));
 
   // Return if flag is not set by a floating point comparison.
   if (Cond.getOpcode() != MipsISD::FPCmp)
     return Op;
 
-  return CreateCMovFP(DAG, Cond, Op.getOperand(1), Op.getOperand(2),
+  return createCMovFP(DAG, Cond, Op.getOperand(1), Op.getOperand(2),
                       Op.getDebugLoc());
 }
 
 SDValue MipsTargetLowering::
-LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
+lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
 {
   DebugLoc DL = Op.getDebugLoc();
   EVT Ty = Op.getOperand(0).getValueType();
@@ -2217,8 +1613,8 @@ LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
                      Op.getOperand(3));
 }
 
-SDValue MipsTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Cond = CreateFPCmp(DAG, Op);
+SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = createFPCmp(DAG, Op);
 
   assert(Cond.getOpcode() == MipsISD::FPCmp &&
          "Floating point operand expected.");
@@ -2226,13 +1622,13 @@ SDValue MipsTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   SDValue True  = DAG.getConstant(1, MVT::i32);
   SDValue False = DAG.getConstant(0, MVT::i32);
 
-  return CreateCMovFP(DAG, Cond, True, False, Op.getDebugLoc());
+  return createCMovFP(DAG, Cond, True, False, Op.getDebugLoc());
 }
 
-SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
+SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
   // FIXME there isn't actually debug info here
-  DebugLoc dl = Op.getDebugLoc();
+  DebugLoc DL = Op.getDebugLoc();
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
 
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
@@ -2241,12 +1637,12 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
 
     // %gp_rel relocation
     if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
-      SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+      SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
                                               MipsII::MO_GPREL);
-      SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, dl,
+      SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, DL,
                                       DAG.getVTList(MVT::i32), &GA, 1);
       SDValue GPReg = DAG.getRegister(Mips::GP, MVT::i32);
-      return DAG.getNode(ISD::ADD, dl, MVT::i32, GPReg, GPRelNode);
+      return DAG.getNode(ISD::ADD, DL, MVT::i32, GPReg, GPRelNode);
     }
 
     // %hi/%lo relocation
@@ -2264,7 +1660,7 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
                        HasMips64 ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16);
 }
 
-SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
+SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
     return getAddrNonPIC(Op, DAG);
@@ -2273,14 +1669,14 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
 }
 
 SDValue MipsTargetLowering::
-LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
+lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
 {
   // If the relocation model is PIC, use the General Dynamic TLS Model or
   // Local Dynamic TLS model, otherwise use the Initial Exec or
   // Local Exec TLS Model.
 
   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
-  DebugLoc dl = GA->getDebugLoc();
+  DebugLoc DL = GA->getDebugLoc();
   const GlobalValue *GV = GA->getGlobal();
   EVT PtrVT = getPointerTy();
 
@@ -2291,9 +1687,9 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
     unsigned Flag = (model == TLSModel::LocalDynamic) ? MipsII::MO_TLSLDM
                                                       : MipsII::MO_TLSGD;
 
-    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, Flag);
-    SDValue Argument = DAG.getNode(MipsISD::Wrapper, dl, PtrVT,
-                                   GetGlobalReg(DAG, PtrVT), TGA);
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, Flag);
+    SDValue Argument = DAG.getNode(MipsISD::Wrapper, DL, PtrVT,
+                                   getGlobalReg(DAG, PtrVT), TGA);
     unsigned PtrSize = PtrVT.getSizeInBits();
     IntegerType *PtrTy = Type::getIntNTy(*DAG.getContext(), PtrSize);
 
@@ -2307,9 +1703,9 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
 
     TargetLowering::CallLoweringInfo CLI(DAG.getEntryNode(), PtrTy,
                   false, false, false, false, 0, CallingConv::C,
-                  /*isTailCall=*/false, /*doesNotRet=*/false,
+                  /*IsTailCall=*/false, /*doesNotRet=*/false,
                   /*isReturnValueUsed=*/true,
-                  TlsGetAddr, Args, DAG, dl);
+                  TlsGetAddr, Args, DAG, DL);
     std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
 
     SDValue Ret = CallResult.first;
@@ -2317,44 +1713,44 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
     if (model != TLSModel::LocalDynamic)
       return Ret;
 
-    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                                MipsII::MO_DTPREL_HI);
-    SDValue Hi = DAG.getNode(MipsISD::Hi, dl, PtrVT, TGAHi);
-    SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+    SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                                MipsII::MO_DTPREL_LO);
-    SDValue Lo = DAG.getNode(MipsISD::Lo, dl, PtrVT, TGALo);
-    SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Ret);
-    return DAG.getNode(ISD::ADD, dl, PtrVT, Add, Lo);
+    SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
+    SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Ret);
+    return DAG.getNode(ISD::ADD, DL, PtrVT, Add, Lo);
   }
 
   SDValue Offset;
   if (model == TLSModel::InitialExec) {
     // Initial Exec TLS Model
-    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                              MipsII::MO_GOTTPREL);
-    TGA = DAG.getNode(MipsISD::Wrapper, dl, PtrVT, GetGlobalReg(DAG, PtrVT),
+    TGA = DAG.getNode(MipsISD::Wrapper, DL, PtrVT, getGlobalReg(DAG, PtrVT),
                       TGA);
-    Offset = DAG.getLoad(PtrVT, dl,
+    Offset = DAG.getLoad(PtrVT, DL,
                          DAG.getEntryNode(), TGA, MachinePointerInfo(),
                          false, false, false, 0);
   } else {
     // Local Exec TLS Model
     assert(model == TLSModel::LocalExec);
-    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                                MipsII::MO_TPREL_HI);
-    SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                                MipsII::MO_TPREL_LO);
-    SDValue Hi = DAG.getNode(MipsISD::Hi, dl, PtrVT, TGAHi);
-    SDValue Lo = DAG.getNode(MipsISD::Lo, dl, PtrVT, TGALo);
-    Offset = DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
+    SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+    SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
+    Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
   }
 
-  SDValue ThreadPointer = DAG.getNode(MipsISD::ThreadPointer, dl, PtrVT);
-  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+  SDValue ThreadPointer = DAG.getNode(MipsISD::ThreadPointer, DL, PtrVT);
+  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadPointer, Offset);
 }
 
 SDValue MipsTargetLowering::
-LowerJumpTable(SDValue Op, SelectionDAG &DAG) const
+lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
 {
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
     return getAddrNonPIC(Op, DAG);
@@ -2363,7 +1759,7 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const
 }
 
 SDValue MipsTargetLowering::
-LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
+lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
 {
   // gp_rel relocation
   // FIXME: we should reference the constant pool using small data sections,
@@ -2381,22 +1777,22 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
   return getAddrLocal(Op, DAG, HasMips64);
 }
 
-SDValue MipsTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
 
-  DebugLoc dl = Op.getDebugLoc();
+  DebugLoc DL = Op.getDebugLoc();
   SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                  getPointerTy());
 
   // vastart just stores the address of the VarArgsFrameIndex slot into the
   // memory location argument.
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
-  return DAG.getStore(Op.getOperand(0), dl, FI, Op.getOperand(1),
+  return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
                       MachinePointerInfo(SV), false, false, 0);
 }
 
-static SDValue LowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   EVT TyX = Op.getOperand(0).getValueType();
   EVT TyY = Op.getOperand(1).getValueType();
   SDValue Const1 = DAG.getConstant(1, MVT::i32);
@@ -2441,7 +1837,7 @@ static SDValue LowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
 }
 
-static SDValue LowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   unsigned WidthX = Op.getOperand(0).getValueSizeInBits();
   unsigned WidthY = Op.getOperand(1).getValueSizeInBits();
   EVT TyX = MVT::getIntegerVT(WidthX), TyY = MVT::getIntegerVT(WidthY);
@@ -2490,14 +1886,14 @@ static SDValue LowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
 }
 
 SDValue
-MipsTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   if (Subtarget->hasMips64())
-    return LowerFCOPYSIGN64(Op, DAG, Subtarget->hasMips32r2());
+    return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasMips32r2());
 
-  return LowerFCOPYSIGN32(Op, DAG, Subtarget->hasMips32r2());
+  return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasMips32r2());
 }
 
-static SDValue LowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
   DebugLoc DL = Op.getDebugLoc();
 
@@ -2526,7 +1922,7 @@ static SDValue LowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
 }
 
-static SDValue LowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
   DebugLoc DL = Op.getDebugLoc();
 
@@ -2547,15 +1943,15 @@ static SDValue LowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
 }
 
 SDValue
-MipsTargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
+MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const {
   if (Subtarget->hasMips64() && (Op.getValueType() == MVT::f64))
-    return LowerFABS64(Op, DAG, Subtarget->hasMips32r2());
+    return lowerFABS64(Op, DAG, Subtarget->hasMips32r2());
 
-  return LowerFABS32(Op, DAG, Subtarget->hasMips32r2());
+  return lowerFABS32(Op, DAG, Subtarget->hasMips32r2());
 }
 
 SDValue MipsTargetLowering::
-LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   // check the depth
   assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
          "Frame address can only be determined for current frame.");
@@ -2563,13 +1959,13 @@ LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   MFI->setFrameAddressIsTaken(true);
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
-  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL,
                                          IsN64 ? Mips::FP_64 : Mips::FP, VT);
   return FrameAddr;
 }
 
-SDValue MipsTargetLowering::LowerRETURNADDR(SDValue Op,
+SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
                                             SelectionDAG &DAG) const {
   // check the depth
   assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
@@ -2590,7 +1986,7 @@ SDValue MipsTargetLowering::LowerRETURNADDR(SDValue Op,
 // generated from __builtin_eh_return (offset, handler)
 // The effect of this is to adjust the stack pointer by "offset"
 // and then branch to "handler".
-SDValue MipsTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
+SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
                                                                      const {
   MachineFunction &MF = DAG.getMachineFunction();
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
@@ -2616,24 +2012,24 @@ SDValue MipsTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
 
 // TODO: set SType according to the desired memory barrier behavior.
 SDValue
-MipsTargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const {
+MipsTargetLowering::lowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const {
   unsigned SType = 0;
-  DebugLoc dl = Op.getDebugLoc();
-  return DAG.getNode(MipsISD::Sync, dl, MVT::Other, Op.getOperand(0),
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(MipsISD::Sync, DL, MVT::Other, Op.getOperand(0),
                      DAG.getConstant(SType, MVT::i32));
 }
 
-SDValue MipsTargetLowering::LowerATOMIC_FENCE(SDValue Op,
+SDValue MipsTargetLowering::lowerATOMIC_FENCE(SDValue Op,
                                               SelectionDAG &DAG) const {
   // FIXME: Need pseudo-fence for 'singlethread' fences
   // FIXME: Set SType for weaker fences where supported/appropriate.
   unsigned SType = 0;
-  DebugLoc dl = Op.getDebugLoc();
-  return DAG.getNode(MipsISD::Sync, dl, MVT::Other, Op.getOperand(0),
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(MipsISD::Sync, DL, MVT::Other, Op.getOperand(0),
                      DAG.getConstant(SType, MVT::i32));
 }
 
-SDValue MipsTargetLowering::LowerShiftLeftParts(SDValue Op,
+SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
   DebugLoc DL = Op.getDebugLoc();
   SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
@@ -2664,7 +2060,7 @@ SDValue MipsTargetLowering::LowerShiftLeftParts(SDValue Op,
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
-SDValue MipsTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
+SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
                                                  bool IsSRA) const {
   DebugLoc DL = Op.getDebugLoc();
   SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
@@ -2723,7 +2119,7 @@ static SDValue CreateLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD,
 }
 
 // Expand an unaligned 32 or 64-bit integer load node.
-SDValue MipsTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   LoadSDNode *LD = cast<LoadSDNode>(Op);
   EVT MemVT = LD->getMemoryVT();
 
@@ -2801,7 +2197,7 @@ static SDValue CreateStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
 }
 
 // Expand an unaligned 32 or 64-bit integer store node.
-SDValue MipsTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   StoreSDNode *SD = cast<StoreSDNode>(Op);
   EVT MemVT = SD->getMemoryVT();
 
@@ -2849,7 +2245,7 @@ SDValue MipsTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 // v1 = copy hi
 // out64 = merge-values (v0, v1)
 //
-static SDValue LowerDSPIntr(SDValue Op, SelectionDAG &DAG,
+static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG,
                             unsigned Opc, bool HasI64In, bool HasI64Out) {
   DebugLoc DL = Op.getDebugLoc();
   bool HasChainIn = Op->getOperand(0).getValueType() == MVT::Other;
@@ -2894,95 +2290,95 @@ static SDValue LowerDSPIntr(SDValue Op, SelectionDAG &DAG,
   return DAG.getMergeValues(Vals, 2, DL);
 }
 
-SDValue MipsTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+SDValue MipsTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                     SelectionDAG &DAG) const {
   switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) {
   default:
     return SDValue();
   case Intrinsic::mips_shilo:
-    return LowerDSPIntr(Op, DAG, MipsISD::SHILO, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::SHILO, true, true);
   case Intrinsic::mips_dpau_h_qbl:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBL, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBL, true, true);
   case Intrinsic::mips_dpau_h_qbr:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBR, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBR, true, true);
   case Intrinsic::mips_dpsu_h_qbl:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBL, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBL, true, true);
   case Intrinsic::mips_dpsu_h_qbr:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBR, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBR, true, true);
   case Intrinsic::mips_dpa_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPA_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPA_W_PH, true, true);
   case Intrinsic::mips_dps_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPS_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPS_W_PH, true, true);
   case Intrinsic::mips_dpax_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPAX_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAX_W_PH, true, true);
   case Intrinsic::mips_dpsx_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPSX_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSX_W_PH, true, true);
   case Intrinsic::mips_mulsa_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::MULSA_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MULSA_W_PH, true, true);
   case Intrinsic::mips_mult:
-    return LowerDSPIntr(Op, DAG, MipsISD::MULT, false, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MULT, false, true);
   case Intrinsic::mips_multu:
-    return LowerDSPIntr(Op, DAG, MipsISD::MULTU, false, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MULTU, false, true);
   case Intrinsic::mips_madd:
-    return LowerDSPIntr(Op, DAG, MipsISD::MADD_DSP, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MADD_DSP, true, true);
   case Intrinsic::mips_maddu:
-    return LowerDSPIntr(Op, DAG, MipsISD::MADDU_DSP, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MADDU_DSP, true, true);
   case Intrinsic::mips_msub:
-    return LowerDSPIntr(Op, DAG, MipsISD::MSUB_DSP, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MSUB_DSP, true, true);
   case Intrinsic::mips_msubu:
-    return LowerDSPIntr(Op, DAG, MipsISD::MSUBU_DSP, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MSUBU_DSP, true, true);
   }
 }
 
-SDValue MipsTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
+SDValue MipsTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
                                                    SelectionDAG &DAG) const {
   switch (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue()) {
   default:
     return SDValue();
   case Intrinsic::mips_extp:
-    return LowerDSPIntr(Op, DAG, MipsISD::EXTP, true, false);
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTP, true, false);
   case Intrinsic::mips_extpdp:
-    return LowerDSPIntr(Op, DAG, MipsISD::EXTPDP, true, false);
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTPDP, true, false);
   case Intrinsic::mips_extr_w:
-    return LowerDSPIntr(Op, DAG, MipsISD::EXTR_W, true, false);
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_W, true, false);
   case Intrinsic::mips_extr_r_w:
-    return LowerDSPIntr(Op, DAG, MipsISD::EXTR_R_W, true, false);
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_R_W, true, false);
   case Intrinsic::mips_extr_rs_w:
-    return LowerDSPIntr(Op, DAG, MipsISD::EXTR_RS_W, true, false);
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_RS_W, true, false);
   case Intrinsic::mips_extr_s_h:
-    return LowerDSPIntr(Op, DAG, MipsISD::EXTR_S_H, true, false);
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_S_H, true, false);
   case Intrinsic::mips_mthlip:
-    return LowerDSPIntr(Op, DAG, MipsISD::MTHLIP, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MTHLIP, true, true);
   case Intrinsic::mips_mulsaq_s_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::MULSAQ_S_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MULSAQ_S_W_PH, true, true);
   case Intrinsic::mips_maq_s_w_phl:
-    return LowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHL, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHL, true, true);
   case Intrinsic::mips_maq_s_w_phr:
-    return LowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHR, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHR, true, true);
   case Intrinsic::mips_maq_sa_w_phl:
-    return LowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHL, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHL, true, true);
   case Intrinsic::mips_maq_sa_w_phr:
-    return LowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHR, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHR, true, true);
   case Intrinsic::mips_dpaq_s_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPAQ_S_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQ_S_W_PH, true, true);
   case Intrinsic::mips_dpsq_s_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPSQ_S_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQ_S_W_PH, true, true);
   case Intrinsic::mips_dpaq_sa_l_w:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPAQ_SA_L_W, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQ_SA_L_W, true, true);
   case Intrinsic::mips_dpsq_sa_l_w:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPSQ_SA_L_W, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQ_SA_L_W, true, true);
   case Intrinsic::mips_dpaqx_s_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPAQX_S_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQX_S_W_PH, true, true);
   case Intrinsic::mips_dpaqx_sa_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPAQX_SA_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQX_SA_W_PH, true, true);
   case Intrinsic::mips_dpsqx_s_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPSQX_S_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_S_W_PH, true, true);
   case Intrinsic::mips_dpsqx_sa_w_ph:
-    return LowerDSPIntr(Op, DAG, MipsISD::DPSQX_SA_W_PH, true, true);
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_SA_W_PH, true, true);
   }
 }
 
-SDValue MipsTargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
+SDValue MipsTargetLowering::lowerADD(SDValue Op, SelectionDAG &DAG) const {
   if (Op->getOperand(0).getOpcode() != ISD::FRAMEADDR
       || cast<ConstantSDNode>
         (Op->getOperand(0).getOperand(0))->getZExtValue() != 0
@@ -3119,28 +2515,6 @@ static unsigned getNextIntArgReg(unsigned Reg) {
   return (Reg == Mips::A0) ? Mips::A1 : Mips::A3;
 }
 
-/// IsEligibleForTailCallOptimization - Check whether the call is eligible
-/// for tail call optimization.
-bool MipsTargetLowering::
-IsEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
-                                  unsigned NextStackOffset,
-                                  const MipsFunctionInfo& FI) const {
-  if (!EnableMipsTailCalls)
-    return false;
-
-  // No tail call optimization for mips16.
-  if (Subtarget->inMips16Mode())
-    return false;
-
-  // Return false if either the callee or caller has a byval argument.
-  if (MipsCCInfo.hasByValArg() || FI.hasByvalArg())
-    return false;
-
-  // Return true if the callee's argument area is no larger than the
-  // caller's.
-  return NextStackOffset <= FI.getIncomingArgSize();
-}
-
 SDValue
 MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
                                    SDValue Chain, SDValue Arg, DebugLoc DL,
@@ -3159,161 +2533,48 @@ MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
                       /*isVolatile=*/ true, false, 0);
 }
 
-//
-// The Mips16 hard float is a crazy quilt inherited from gcc. I have a much
-// cleaner way to do all of this but it will have to wait until the traditional
-// gcc mechanism is completed.
-//
-// For Pic, in order for Mips16 code to call Mips32 code which according the abi
-// have either arguments or returned values placed in floating point registers,
-// we use a set of helper functions. (This includes functions which return type
-//  complex which on Mips are returned in a pair of floating point registers).
-//
-// This is an encoding that we inherited from gcc.
-// In Mips traditional O32, N32 ABI, floating point numbers are passed in
-// floating point argument registers 1,2 only when the first and optionally
-// the second arguments are float (sf) or double (df).
-// For Mips16 we are only concerned with the situations where floating point
-// arguments are being passed in floating point registers by the ABI, because
-// Mips16 mode code cannot execute floating point instructions to load those
-// values and hence helper functions are needed.
-// The possibilities are (), (sf), (sf, sf), (sf, df), (df), (df, sf), (df, df)
-// the helper function suffixs for these are:
-//                        0,  1,    5,        9,         2,   6,        10
-// this suffix can then be calculated as follows:
-// for a given argument Arg:
-//     Arg1x, Arg2x = 1 :  Arg is sf
-//                    2 :  Arg is df
-//                    0:   Arg is neither sf or df
-// So this stub is the string for number Arg1x + Arg2x*4.
-// However not all numbers between 0 and 10 are possible, we check anyway and
-// assert if the impossible exists.
-//
-
-unsigned int MipsTargetLowering::getMips16HelperFunctionStubNumber
-  (ArgListTy &Args) const {
-  unsigned int resultNum = 0;
-  if (Args.size() >= 1) {
-    Type *t = Args[0].Ty;
-    if (t->isFloatTy()) {
-      resultNum = 1;
-    }
-    else if (t->isDoubleTy()) {
-      resultNum = 2;
-    }
-  }
-  if (resultNum) {
-    if (Args.size() >=2) {
-      Type *t = Args[1].Ty;
-      if (t->isFloatTy()) {
-        resultNum += 4;
-      }
-      else if (t->isDoubleTy()) {
-        resultNum += 8;
-      }
-    }
+void MipsTargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+  // Insert node "GP copy globalreg" before call to function.
+  //
+  // R_MIPS_CALL* operators (emitted when non-internal functions are called
+  // in PIC mode) allow symbols to be resolved via lazy binding.
+  // The lazy binding stub requires GP to point to the GOT.
+  if (IsPICCall && !InternalLinkage) {
+    unsigned GPReg = IsN64 ? Mips::GP_64 : Mips::GP;
+    EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
+    RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CLI.DAG, Ty)));
   }
-  return resultNum;
-}
 
-//
-// prefixs are attached to stub numbers depending on the return type .
-// return type: float  sf_
-//              double df_
-//              single complex sc_
-//              double complext dc_
-//              others  NO PREFIX
-//
-//
-// The full name of a helper function is__mips16_call_stub +
-//    return type dependent prefix + stub number
-//
-//
-// This is something that probably should be in a different source file and
-// perhaps done differently but my main purpose is to not waste runtime
-// on something that we can enumerate in the source. Another possibility is
-// to have a python script to generate these mapping tables. This will do
-// for now. There are a whole series of helper function mapping arrays, one
-// for each return type class as outlined above. There there are 11 possible
-//  entries. Ones with 0 are ones which should never be selected
-//
-// All the arrays are similar except for ones which return neither
-// sf, df, sc, dc, in which only care about ones which have sf or df as a
-// first parameter.
-//
-#define P_ "__mips16_call_stub_"
-#define MAX_STUB_NUMBER 10
-#define T1 P "1", P "2", 0, 0, P "5", P "6", 0, 0, P "9", P "10"
-#define T P "0" , T1
-#define P P_
-static char const * vMips16Helper[MAX_STUB_NUMBER+1] =
-  {0, T1 };
-#undef P
-#define P P_ "sf_"
-static char const * sfMips16Helper[MAX_STUB_NUMBER+1] =
-  { T };
-#undef P
-#define P P_ "df_"
-static char const * dfMips16Helper[MAX_STUB_NUMBER+1] =
-  { T };
-#undef P
-#define P P_ "sc_"
-static char const * scMips16Helper[MAX_STUB_NUMBER+1] =
-  { T };
-#undef P
-#define P P_ "dc_"
-static char const * dcMips16Helper[MAX_STUB_NUMBER+1] =
-  { T };
-#undef P
-#undef P_
-
-
-const char* MipsTargetLowering::
-  getMips16HelperFunction
-    (Type* RetTy, ArgListTy &Args, bool &needHelper) const {
-  const unsigned int stubNum = getMips16HelperFunctionStubNumber(Args);
-#ifndef NDEBUG
-  const unsigned int maxStubNum = 10;
-  assert(stubNum <= maxStubNum);
-  const bool validStubNum[maxStubNum+1] =
-    {true, true, true, false, false, true, true, false, false, true, true};
-  assert(validStubNum[stubNum]);
-#endif
-  const char *result;
-  if (RetTy->isFloatTy()) {
-    result = sfMips16Helper[stubNum];
-  }
-  else if (RetTy ->isDoubleTy()) {
-    result = dfMips16Helper[stubNum];
-  }
-  else if (RetTy->isStructTy()) {
-    // check if it's complex
-    if (RetTy->getNumContainedTypes() == 2) {
-      if ((RetTy->getContainedType(0)->isFloatTy()) &&
-          (RetTy->getContainedType(1)->isFloatTy())) {
-        result = scMips16Helper[stubNum];
-      }
-      else if ((RetTy->getContainedType(0)->isDoubleTy()) &&
-               (RetTy->getContainedType(1)->isDoubleTy())) {
-        result = dcMips16Helper[stubNum];
-      }
-      else {
-        llvm_unreachable("Uncovered condition");
-      }
-    }
-    else {
-      llvm_unreachable("Uncovered condition");
-    }
-  }
-  else {
-    if (stubNum == 0) {
-      needHelper = false;
-      return "";
-    }
-    result = vMips16Helper[stubNum];
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = CLI.DAG.getCopyToReg(Chain, CLI.DL, RegsToPass[i].first,
+                                 RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
   }
-  needHelper = true;
-  return result;
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(CLI.DAG.getRegister(RegsToPass[i].first,
+                                      RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CLI.CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(CLI.DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
 }
 
 /// LowerCall - functions arguments are copied from virtual regs to
@@ -3322,36 +2583,16 @@ SDValue
 MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                               SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &dl                          = CLI.DL;
+  DebugLoc &DL                          = CLI.DL;
   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
   SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
   SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
-  bool &isTailCall                      = CLI.IsTailCall;
+  bool &IsTailCall                      = CLI.IsTailCall;
   CallingConv::ID CallConv              = CLI.CallConv;
-  bool isVarArg                         = CLI.IsVarArg;
-
-  const char* mips16HelperFunction = 0;
-  bool needMips16Helper = false;
-
-  if (Subtarget->inMips16Mode() && getTargetMachine().Options.UseSoftFloat &&
-      Mips16HardFloat) {
-    //
-    // currently we don't have symbols tagged with the mips16 or mips32
-    // qualifier so we will assume that we don't know what kind it is.
-    // and generate the helper
-    //
-    bool lookupHelper = true;
-    if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
-      if (noHelperNeeded.find(S->getSymbol()) != noHelperNeeded.end()) {
-        lookupHelper = false;
-      }
-    }
-    if (lookupHelper) mips16HelperFunction =
-      getMips16HelperFunction(CLI.RetTy, CLI.Args, needMips16Helper);
+  bool IsVarArg                         = CLI.IsVarArg;
 
-  }
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering();
@@ -3359,22 +2600,24 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
   MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
 
-  MipsCCInfo.analyzeCallOperands(Outs, isVarArg);
+  MipsCCInfo.analyzeCallOperands(Outs, IsVarArg,
+                                 getTargetMachine().Options.UseSoftFloat,
+                                 Callee.getNode(), CLI.Args);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NextStackOffset = CCInfo.getNextStackOffset();
 
   // Check if it's really possible to do a tail call.
-  if (isTailCall)
-    isTailCall =
-      IsEligibleForTailCallOptimization(MipsCCInfo, NextStackOffset,
+  if (IsTailCall)
+    IsTailCall =
+      isEligibleForTailCallOptimization(MipsCCInfo, NextStackOffset,
                                         *MF.getInfo<MipsFunctionInfo>());
 
-  if (isTailCall)
+  if (IsTailCall)
     ++NumTailCalls;
 
   // Chain is the output chain of the last Load/Store or CopyToReg node.
@@ -3384,10 +2627,10 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   NextStackOffset = RoundUpToAlignment(NextStackOffset, StackAlignment);
   SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true);
 
-  if (!isTailCall)
+  if (!IsTailCall)
     Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal);
 
-  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl,
+  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL,
                                         IsN64 ? Mips::SP_64 : Mips::SP,
                                         getPointerTy());
 
@@ -3408,9 +2651,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       assert(Flags.getByValSize() &&
              "ByVal args of size 0 should have been ignored by front-end.");
       assert(ByValArg != MipsCCInfo.byval_end());
-      assert(!isTailCall &&
+      assert(!IsTailCall &&
              "Do not tail-call optimize if there is a byval argument.");
-      passByValArg(Chain, dl, RegsToPass, MemOpChains, StackPtr, MFI, DAG, Arg,
+      passByValArg(Chain, DL, RegsToPass, MemOpChains, StackPtr, MFI, DAG, Arg,
                    MipsCCInfo, *ByValArg, Flags, Subtarget->isLittle());
       ++ByValArg;
       continue;
@@ -3422,12 +2665,13 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     case CCValAssign::Full:
       if (VA.isRegLoc()) {
         if ((ValVT == MVT::f32 && LocVT == MVT::i32) ||
-            (ValVT == MVT::f64 && LocVT == MVT::i64))
-          Arg = DAG.getNode(ISD::BITCAST, dl, LocVT, Arg);
+            (ValVT == MVT::f64 && LocVT == MVT::i64) ||
+            (ValVT == MVT::i64 && LocVT == MVT::f64))
+          Arg = DAG.getNode(ISD::BITCAST, DL, LocVT, Arg);
         else if (ValVT == MVT::f64 && LocVT == MVT::i32) {
-          SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+          SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
                                    Arg, DAG.getConstant(0, MVT::i32));
-          SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+          SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
                                    Arg, DAG.getConstant(1, MVT::i32));
           if (!Subtarget->isLittle())
             std::swap(Lo, Hi);
@@ -3440,13 +2684,13 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       }
       break;
     case CCValAssign::SExt:
-      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, LocVT, Arg);
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, LocVT, Arg);
       break;
     case CCValAssign::ZExt:
-      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, LocVT, Arg);
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, LocVT, Arg);
       break;
     case CCValAssign::AExt:
-      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, LocVT, Arg);
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, LocVT, Arg);
       break;
     }
 
@@ -3463,13 +2707,13 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // emit ISD::STORE whichs stores the
     // parameter value to a stack Location
     MemOpChains.push_back(passArgOnStack(StackPtr, VA.getLocMemOffset(),
-                                         Chain, Arg, dl, isTailCall, DAG));
+                                         Chain, Arg, DL, IsTailCall, DAG));
   }
 
   // Transform all store nodes into one single node because all store
   // nodes are independent of each other.
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                         &MemOpChains[0], MemOpChains.size());
 
   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
@@ -3491,7 +2735,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       else
         Callee = getAddrGlobal(Callee, DAG, MipsII::MO_GOT_CALL);
     } else
-      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy(), 0,
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0,
                                           MipsII::MO_NO_FLAG);
     GlobalOrExternal = true;
   }
@@ -3508,80 +2752,17 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     GlobalOrExternal = true;
   }
 
-  SDValue JumpTarget = Callee;
-
-  // T9 should contain the address of the callee function if
-  // -reloction-model=pic or it is an indirect call.
-  if (IsPICCall || !GlobalOrExternal) {
-    unsigned T9Reg = IsN64 ? Mips::T9_64 : Mips::T9;
-    unsigned V0Reg = Mips::V0;
-    if (needMips16Helper) {
-      RegsToPass.push_front(std::make_pair(V0Reg, Callee));
-      JumpTarget = DAG.getExternalSymbol(
-        mips16HelperFunction, getPointerTy());
-      JumpTarget = getAddrGlobal(JumpTarget, DAG, MipsII::MO_GOT);
-    }
-    else {
-      RegsToPass.push_front(std::make_pair(T9Reg, Callee));
-
-      if (!Subtarget->inMips16Mode())
-        JumpTarget = SDValue();
-    }
-  }
-
-  // Insert node "GP copy globalreg" before call to function.
-  //
-  // R_MIPS_CALL* operators (emitted when non-internal functions are called
-  // in PIC mode) allow symbols to be resolved via lazy binding.
-  // The lazy binding stub requires GP to point to the GOT.
-  if (IsPICCall && !InternalLinkage) {
-    unsigned GPReg = IsN64 ? Mips::GP_64 : Mips::GP;
-    EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
-    RegsToPass.push_back(std::make_pair(GPReg, GetGlobalReg(DAG, Ty)));
-  }
-
-  // Build a sequence of copy-to-reg nodes chained together with token
-  // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emitted instructions must be
-  // stuck together.
-  SDValue InFlag;
-
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                             RegsToPass[i].second, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
-  // MipsJmpLink = #chain, #target_address, #opt_in_flags...
-  //             = Chain, Callee, Reg#1, Reg#2, ...
-  //
-  // Returns a chain & a flag for retval copy to use.
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   SmallVector<SDValue, 8> Ops(1, Chain);
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
-  if (JumpTarget.getNode())
-    Ops.push_back(JumpTarget);
-
-  // Add argument registers to the end of the list so that they are
-  // known live into the call.
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
-    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
-                                  RegsToPass[i].second.getValueType()));
-
-  // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
-  assert(Mask && "Missing call preserved mask for calling convention");
-  Ops.push_back(DAG.getRegisterMask(Mask));
+  getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal, InternalLinkage,
+              CLI, Callee, Chain);
 
-  if (InFlag.getNode())
-    Ops.push_back(InFlag);
+  if (IsTailCall)
+    return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, &Ops[0], Ops.size());
 
-  if (isTailCall)
-    return DAG.getNode(MipsISD::TailCall, dl, MVT::Other, &Ops[0], Ops.size());
-
-  Chain  = DAG.getNode(MipsISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size());
-  InFlag = Chain.getValue(1);
+  Chain  = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, &Ops[0], Ops.size());
+  SDValue InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
   Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
@@ -3590,31 +2771,40 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Handle result values, copying them out of physregs into vregs that we
   // return.
-  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
-                         Ins, dl, DAG, InVals);
+  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg,
+                         Ins, DL, DAG, InVals, CLI.Callee.getNode(), CLI.RetTy);
 }
 
 /// LowerCallResult - Lower the result values of a call into the
 /// appropriate copies out of appropriate physical registers.
 SDValue
 MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
-                                    CallingConv::ID CallConv, bool isVarArg,
+                                    CallingConv::ID CallConv, bool IsVarArg,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    DebugLoc dl, SelectionDAG &DAG,
-                                    SmallVectorImpl<SDValue> &InVals) const {
+                                    DebugLoc DL, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals,
+                                    const SDNode *CallNode,
+                                    const Type *RetTy) const {
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), RVLocs, *DAG.getContext());
+  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
 
-  CCInfo.AnalyzeCallResult(Ins, RetCC_Mips);
+  MipsCCInfo.analyzeCallResult(Ins, getTargetMachine().Options.UseSoftFloat,
+                               CallNode, RetTy);
 
   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
-    Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
-                               RVLocs[i].getValVT(), InFlag).getValue(1);
-    InFlag = Chain.getValue(2);
-    InVals.push_back(Chain.getValue(0));
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, RVLocs[i].getLocReg(),
+                                     RVLocs[i].getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    if (RVLocs[i].getValVT() != RVLocs[i].getLocVT())
+      Val = DAG.getNode(ISD::BITCAST, DL, RVLocs[i].getValVT(), Val);
+
+    InVals.push_back(Val);
   }
 
   return Chain;
@@ -3628,9 +2818,9 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 SDValue
 MipsTargetLowering::LowerFormalArguments(SDValue Chain,
                                          CallingConv::ID CallConv,
-                                         bool isVarArg,
+                                         bool IsVarArg,
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
-                                         DebugLoc dl, SelectionDAG &DAG,
+                                         DebugLoc DL, SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &InVals)
                                           const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -3644,16 +2834,17 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
   MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+  Function::const_arg_iterator FuncArg =
+    DAG.getMachineFunction().getFunction()->arg_begin();
+  bool UseSoftFloat = getTargetMachine().Options.UseSoftFloat;
 
-  MipsCCInfo.analyzeFormalArguments(Ins);
+  MipsCCInfo.analyzeFormalArguments(Ins, UseSoftFloat, FuncArg);
   MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
                            MipsCCInfo.hasByValArg());
 
-  Function::const_arg_iterator FuncArg =
-    DAG.getMachineFunction().getFunction()->arg_begin();
   unsigned CurArgIdx = 0;
   MipsCC::byval_iterator ByValArg = MipsCCInfo.byval_begin();
 
@@ -3669,7 +2860,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
       assert(Flags.getByValSize() &&
              "ByVal args of size 0 should have been ignored by front-end.");
       assert(ByValArg != MipsCCInfo.byval_end());
-      copyByValRegs(Chain, dl, OutChains, DAG, Flags, InVals, &*FuncArg,
+      copyByValRegs(Chain, DL, OutChains, DAG, Flags, InVals, &*FuncArg,
                     MipsCCInfo, *ByValArg);
       ++ByValArg;
       continue;
@@ -3695,8 +2886,8 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
       // Transform the arguments stored on
       // physical registers into virtual ones
-      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgReg, RC);
-      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+      unsigned Reg = addLiveIn(DAG.getMachineFunction(), ArgReg, RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
 
       // If this is an 8 or 16-bit value, it has been passed promoted
       // to 32 bits.  Insert an assert[sz]ext to capture this, then
@@ -3708,22 +2899,24 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
         else if (VA.getLocInfo() == CCValAssign::ZExt)
           Opcode = ISD::AssertZext;
         if (Opcode)
-          ArgValue = DAG.getNode(Opcode, dl, RegVT, ArgValue,
+          ArgValue = DAG.getNode(Opcode, DL, RegVT, ArgValue,
                                  DAG.getValueType(ValVT));
-        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue);
+        ArgValue = DAG.getNode(ISD::TRUNCATE, DL, ValVT, ArgValue);
       }
 
-      // Handle floating point arguments passed in integer registers.
+      // Handle floating point arguments passed in integer registers and
+      // long double arguments passed in floating point registers.
       if ((RegVT == MVT::i32 && ValVT == MVT::f32) ||
-          (RegVT == MVT::i64 && ValVT == MVT::f64))
-        ArgValue = DAG.getNode(ISD::BITCAST, dl, ValVT, ArgValue);
+          (RegVT == MVT::i64 && ValVT == MVT::f64) ||
+          (RegVT == MVT::f64 && ValVT == MVT::i64))
+        ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue);
       else if (IsO32 && RegVT == MVT::i32 && ValVT == MVT::f64) {
-        unsigned Reg2 = AddLiveIn(DAG.getMachineFunction(),
+        unsigned Reg2 = addLiveIn(DAG.getMachineFunction(),
                                   getNextIntArgReg(ArgReg), RC);
-        SDValue ArgValue2 = DAG.getCopyFromReg(Chain, dl, Reg2, RegVT);
+        SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT);
         if (!Subtarget->isLittle())
           std::swap(ArgValue, ArgValue2);
-        ArgValue = DAG.getNode(MipsISD::BuildPairF64, dl, MVT::f64,
+        ArgValue = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64,
                                ArgValue, ArgValue2);
       }
 
@@ -3739,7 +2932,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
       // Create load nodes to retrieve arguments from the stack
       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-      InVals.push_back(DAG.getLoad(ValVT, dl, Chain, FIN,
+      InVals.push_back(DAG.getLoad(ValVT, DL, Chain, FIN,
                                    MachinePointerInfo::getFixedStack(FI),
                                    false, false, false, 0));
     }
@@ -3755,18 +2948,18 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
         createVirtualRegister(getRegClassFor(IsN64 ? MVT::i64 : MVT::i32));
       MipsFI->setSRetReturnReg(Reg);
     }
-    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[0]);
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
   }
 
-  if (isVarArg)
-    writeVarArgRegs(OutChains, MipsCCInfo, Chain, dl, DAG);
+  if (IsVarArg)
+    writeVarArgRegs(OutChains, MipsCCInfo, Chain, DL, DAG);
 
   // All stores are grouped in one node to allow the matching between
   // the size of Ins and InVals. This only happens when on varg functions
   if (!OutChains.empty()) {
     OutChains.push_back(Chain);
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                         &OutChains[0], OutChains.size());
   }
 
@@ -3779,42 +2972,48 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
 bool
 MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
-                                   MachineFunction &MF, bool isVarArg,
+                                   MachineFunction &MF, bool IsVarArg,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+  CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(),
                  RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC_Mips);
 }
 
 SDValue
 MipsTargetLowering::LowerReturn(SDValue Chain,
-                                CallingConv::ID CallConv, bool isVarArg,
+                                CallingConv::ID CallConv, bool IsVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
-                                DebugLoc dl, SelectionDAG &DAG) const {
-
+                                DebugLoc DL, SelectionDAG &DAG) const {
   // CCValAssign - represent the assignment of
   // the return value to a location
   SmallVector<CCValAssign, 16> RVLocs;
+  MachineFunction &MF = DAG.getMachineFunction();
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(), RVLocs,
+                 *DAG.getContext());
+  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
 
-  // Analize return values.
-  CCInfo.AnalyzeReturn(Outs, RetCC_Mips);
+  // Analyze return values.
+  MipsCCInfo.analyzeReturn(Outs, getTargetMachine().Options.UseSoftFloat,
+                           MF.getFunction()->getReturnType());
 
   SDValue Flag;
   SmallVector<SDValue, 4> RetOps(1, Chain);
 
   // Copy the result values into the output registers.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    SDValue Val = OutVals[i];
     CCValAssign &VA = RVLocs[i];
     assert(VA.isRegLoc() && "Can only return in registers!");
 
-    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag);
+    if (RVLocs[i].getValVT() != RVLocs[i].getLocVT())
+      Val = DAG.getNode(ISD::BITCAST, DL, RVLocs[i].getLocVT(), Val);
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Flag);
 
     // Guarantee that all emitted copies are stuck together with flags.
     Flag = Chain.getValue(1);
@@ -3825,17 +3024,16 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
   // the sret argument into $v0 for the return. We saved the argument into
   // a virtual register in the entry block, so now we copy the value out
   // and into $v0.
-  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
-    MachineFunction &MF      = DAG.getMachineFunction();
+  if (MF.getFunction()->hasStructRetAttr()) {
     MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
     unsigned Reg = MipsFI->getSRetReturnReg();
 
     if (!Reg)
       llvm_unreachable("sret virtual register not created in the entry block");
-    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
     unsigned V0 = IsN64 ? Mips::V0_64 : Mips::V0;
 
-    Chain = DAG.getCopyToReg(Chain, dl, V0, Val, Flag);
+    Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag);
     Flag = Chain.getValue(1);
     RetOps.push_back(DAG.getRegister(V0, getPointerTy()));
   }
@@ -3847,7 +3045,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
     RetOps.push_back(Flag);
 
   // Return on Mips is always a "jr $ra"
-  return DAG.getNode(MipsISD::Ret, dl, MVT::Other, &RetOps[0], RetOps.size());
+  return DAG.getNode(MipsISD::Ret, DL, MVT::Other, &RetOps[0], RetOps.size());
 }
 
 //===----------------------------------------------------------------------===//
@@ -3880,6 +3078,8 @@ getConstraintType(const std::string &Constraint) const
       case 'l':
       case 'x':
         return C_RegisterClass;
+      case 'R':
+        return C_Memory;
     }
   }
   return TargetLowering::getConstraintType(Constraint);
@@ -3928,6 +3128,9 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
     if (isa<ConstantInt>(CallOperandVal))
       weight = CW_Constant;
     break;
+  case 'R':
+    weight = CW_Memory;
+    break;
   }
   return weight;
 }
@@ -4128,6 +3331,46 @@ unsigned MipsTargetLowering::getJumpTableEncoding() const {
   return TargetLowering::getJumpTableEncoding();
 }
 
+/// This function returns true if CallSym is a long double emulation routine.
+static bool isF128SoftLibCall(const char *CallSym) {
+  const char *const LibCalls[] =
+    {"__addtf3", "__divtf3", "__eqtf2", "__extenddftf2", "__extendsftf2",
+     "__fixtfdi", "__fixtfsi", "__fixtfti", "__fixunstfdi", "__fixunstfsi",
+     "__fixunstfti", "__floatditf", "__floatsitf", "__floattitf",
+     "__floatunditf", "__floatunsitf", "__floatuntitf", "__getf2", "__gttf2",
+     "__letf2", "__lttf2", "__multf3", "__netf2", "__powitf2", "__subtf3",
+     "__trunctfdf2", "__trunctfsf2", "__unordtf2",
+     "ceill", "copysignl", "cosl", "exp2l", "expl", "floorl", "fmal", "fmodl",
+     "log10l", "log2l", "logl", "nearbyintl", "powl", "rintl", "sinl", "sqrtl",
+     "truncl"};
+
+  const char * const *End = LibCalls + array_lengthof(LibCalls);
+
+  // Check that LibCalls is sorted alphabetically.
+  MipsTargetLowering::LTStr Comp;
+
+#ifndef NDEBUG
+  for (const char * const *I = LibCalls; I < End - 1; ++I)
+    assert(Comp(*I, *(I + 1)));
+#endif
+
+  return std::binary_search(LibCalls, End, CallSym, Comp);
+}
+
+/// This function returns true if Ty is fp128 or i128 which was originally a
+/// fp128.
+static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) {
+  if (Ty->isFP128Ty())
+    return true;
+
+  const ExternalSymbolSDNode *ES =
+    dyn_cast_or_null<const ExternalSymbolSDNode>(CallNode);
+
+  // If the Ty is i128 and the function being called is a long double emulation
+  // routine, then the original type is f128.
+  return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol()));
+}
+
 MipsTargetLowering::MipsCC::MipsCC(CallingConv::ID CC, bool IsO32_,
                                    CCState &Info)
   : CCInfo(Info), CallConv(CC), IsO32(IsO32_) {
@@ -4137,7 +3380,8 @@ MipsTargetLowering::MipsCC::MipsCC(CallingConv::ID CC, bool IsO32_,
 
 void MipsTargetLowering::MipsCC::
 analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args,
-                    bool IsVarArg) {
+                    bool IsVarArg, bool IsSoftFloat, const SDNode *CallNode,
+                    std::vector<ArgListEntry> &FuncArgs) {
   assert((CallConv != CallingConv::Fast || !IsVarArg) &&
          "CallingConv::Fast shouldn't be used for vararg functions.");
 
@@ -4156,8 +3400,11 @@ analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args,
 
     if (IsVarArg && !Args[I].IsFixed)
       R = VarFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
-    else
-      R = FixedFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+    else {
+      MVT RegVT = getRegVT(ArgVT, FuncArgs[Args[I].OrigArgIndex].Ty, CallNode,
+                           IsSoftFloat);
+      R = FixedFn(I, ArgVT, RegVT, CCValAssign::Full, ArgFlags, CCInfo);
+    }
 
     if (R) {
 #ifndef NDEBUG
@@ -4170,20 +3417,26 @@ analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args,
 }
 
 void MipsTargetLowering::MipsCC::
-analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Args) {
+analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Args,
+                       bool IsSoftFloat, Function::const_arg_iterator FuncArg) {
   unsigned NumArgs = Args.size();
   llvm::CCAssignFn *FixedFn = fixedArgFn();
+  unsigned CurArgIdx = 0;
 
   for (unsigned I = 0; I != NumArgs; ++I) {
     MVT ArgVT = Args[I].VT;
     ISD::ArgFlagsTy ArgFlags = Args[I].Flags;
+    std::advance(FuncArg, Args[I].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Args[I].OrigArgIndex;
 
     if (ArgFlags.isByVal()) {
       handleByValArg(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags);
       continue;
     }
 
-    if (!FixedFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo))
+    MVT RegVT = getRegVT(ArgVT, FuncArg->getType(), 0, IsSoftFloat);
+
+    if (!FixedFn(I, ArgVT, RegVT, CCValAssign::Full, ArgFlags, CCInfo))
       continue;
 
 #ifndef NDEBUG
@@ -4194,6 +3447,44 @@ analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Args) {
   }
 }
 
+template<typename Ty>
+void MipsTargetLowering::MipsCC::
+analyzeReturn(const SmallVectorImpl<Ty> &RetVals, bool IsSoftFloat,
+              const SDNode *CallNode, const Type *RetTy) const {
+  CCAssignFn *Fn;
+
+  if (IsSoftFloat && originalTypeIsF128(RetTy, CallNode))
+    Fn = RetCC_F128Soft;
+  else
+    Fn = RetCC_Mips;
+
+  for (unsigned I = 0, E = RetVals.size(); I < E; ++I) {
+    MVT VT = RetVals[I].VT;
+    ISD::ArgFlagsTy Flags = RetVals[I].Flags;
+    MVT RegVT = this->getRegVT(VT, RetTy, CallNode, IsSoftFloat);
+
+    if (Fn(I, VT, RegVT, CCValAssign::Full, Flags, this->CCInfo)) {
+#ifndef NDEBUG
+      dbgs() << "Call result #" << I << " has unhandled type "
+             << EVT(VT).getEVTString() << '\n';
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+void MipsTargetLowering::MipsCC::
+analyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsSoftFloat,
+                  const SDNode *CallNode, const Type *RetTy) const {
+  analyzeReturn(Ins, IsSoftFloat, CallNode, RetTy);
+}
+
+void MipsTargetLowering::MipsCC::
+analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsSoftFloat,
+              const Type *RetTy) const {
+  analyzeReturn(Outs, IsSoftFloat, 0, RetTy);
+}
+
 void
 MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
                                            MVT LocVT,
@@ -4268,6 +3559,21 @@ void MipsTargetLowering::MipsCC::allocateRegs(ByValArgInfo &ByVal,
     CCInfo.AllocateReg(IntArgRegs[I], ShadowRegs[I]);
 }
 
+MVT MipsTargetLowering::MipsCC::getRegVT(MVT VT, const Type *OrigTy,
+                                         const SDNode *CallNode,
+                                         bool IsSoftFloat) const {
+  if (IsSoftFloat || IsO32)
+    return VT;
+
+  // Check if the original type was fp128.
+  if (originalTypeIsF128(OrigTy, CallNode)) {
+    assert(VT == MVT::i64);
+    return MVT::f64;
+  }
+
+  return VT;
+}
+
 void MipsTargetLowering::
 copyByValRegs(SDValue Chain, DebugLoc DL, std::vector<SDValue> &OutChains,
               SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
@@ -4300,7 +3606,7 @@ copyByValRegs(SDValue Chain, DebugLoc DL, std::vector<SDValue> &OutChains,
 
   for (unsigned I = 0; I < ByVal.NumRegs; ++I) {
     unsigned ArgReg = CC.intArgRegs()[ByVal.FirstIdx + I];
-    unsigned VReg = AddLiveIn(MF, ArgReg, RC);
+    unsigned VReg = addLiveIn(MF, ArgReg, RC);
     unsigned Offset = I * CC.regSize();
     SDValue StorePtr = DAG.getNode(ISD::ADD, DL, PtrTy, FIN,
                                    DAG.getConstant(Offset, PtrTy));
@@ -4442,7 +3748,7 @@ MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
   // in the caller's stack frame, while for N32/64, it is allocated in the
   // callee's stack frame.
   for (unsigned I = Idx; I < NumRegs; ++I, VaArgOffset += RegSize) {
-    unsigned Reg = AddLiveIn(MF, ArgRegs[I], RC);
+    unsigned Reg = addLiveIn(MF, ArgRegs[I], RC);
     SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy);
     FI = MFI->CreateFixedObject(RegSize, VaArgOffset, true);
     SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index f0f3782..71977d7 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -19,6 +19,7 @@
 #include "MipsSubtarget.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Function.h"
 #include "llvm/Target/TargetLowering.h"
 #include <deque>
 #include <string>
@@ -151,9 +152,9 @@ namespace llvm {
   public:
     explicit MipsTargetLowering(MipsTargetMachine &TM);
 
-    virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+    static const MipsTargetLowering *create(MipsTargetMachine &TM);
 
-    virtual bool allowsUnalignedMemoryAccesses (EVT VT, bool *Fast) const;
+    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
 
     virtual void LowerOperationWrapper(SDNode *N,
                                        SmallVectorImpl<SDValue> &Results,
@@ -176,17 +177,34 @@ namespace llvm {
     EVT getSetCCResultType(EVT VT) const;
 
     virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-  private:
 
-    void SetMips16LibcallName(RTLIB::Libcall, const char *Name);
+    virtual MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+
+    struct LTStr {
+      bool operator()(const char *S1, const char *S2) const {
+        return strcmp(S1, S2) < 0;
+      }
+    };
+
+  protected:
+    SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
 
-    void setMips16HardFloatLibCalls();
+    SDValue getAddrLocal(SDValue Op, SelectionDAG &DAG, bool HasMips64) const;
 
-    unsigned int
-      getMips16HelperFunctionStubNumber(ArgListTy &Args) const;
+    SDValue getAddrGlobal(SDValue Op, SelectionDAG &DAG, unsigned Flag) const;
 
-    const char *getMips16HelperFunction
-      (Type* RetTy, ArgListTy &Args, bool &needHelper) const;
+    SDValue getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
+                                  unsigned HiFlag, unsigned LoFlag) const;
+
+    /// This function fills Ops, which is the list of operands that will later
+    /// be used when a function call node is created. It also generates
+    /// copyToReg nodes to set up argument registers.
+    virtual void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
 
     /// ByValArgInfo - Byval argument information.
     struct ByValArgInfo {
@@ -204,8 +222,20 @@ namespace llvm {
       MipsCC(CallingConv::ID CallConv, bool IsO32, CCState &Info);
 
       void analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
-                               bool IsVarArg);
-      void analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins);
+                               bool IsVarArg, bool IsSoftFloat,
+                               const SDNode *CallNode,
+                               std::vector<ArgListEntry> &FuncArgs);
+      void analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  bool IsSoftFloat,
+                                  Function::const_arg_iterator FuncArg);
+
+      void analyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+                             bool IsSoftFloat, const SDNode *CallNode,
+                             const Type *RetTy) const;
+
+      void analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                         bool IsSoftFloat, const Type *RetTy) const;
+
       const CCState &getCCInfo() const { return CCInfo; }
 
       /// hasByValArg - Returns true if function has byval arguments.
@@ -248,6 +278,17 @@ namespace llvm {
       void allocateRegs(ByValArgInfo &ByVal, unsigned ByValSize,
                         unsigned Align);
 
+      /// Return the type of the register which is used to pass an argument or
+      /// return a value. This function returns f64 if the argument is an i64
+      /// value which has been generated as a result of softening an f128 value.
+      /// Otherwise, it just returns VT.
+      MVT getRegVT(MVT VT, const Type *OrigTy, const SDNode *CallNode,
+                   bool IsSoftFloat) const;
+
+      template<typename Ty>
+      void analyzeReturn(const SmallVectorImpl<Ty> &RetVals, bool IsSoftFloat,
+                         const SDNode *CallNode, const Type *RetTy) const;
+
       CCState &CCInfo;
       CallingConv::ID CallConv;
       bool IsO32;
@@ -259,45 +300,49 @@ namespace llvm {
 
     bool HasMips64, IsN64, IsO32;
 
+  private:
     // Lower Operand helpers
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             DebugLoc dl, SelectionDAG &DAG,
-                            SmallVectorImpl<SDValue> &InVals) const;
+                            SmallVectorImpl<SDValue> &InVals,
+                            const SDNode *CallNode, const Type *RetTy) const;
 
     // Lower Operand specifics
-    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const;
-    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
-    SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const;
-    SDValue LowerShiftRightParts(SDValue Op, SelectionDAG& DAG,
+    SDValue lowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFABS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const;
+    SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
+    SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const;
+    SDValue lowerShiftRightParts(SDValue Op, SelectionDAG& DAG,
                                  bool IsSRA) const;
-    SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerADD(SDValue Op, SelectionDAG &DAG) const;
 
-    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// isEligibleForTailCallOptimization - Check whether the call is eligible
     /// for tail call optimization.
-    bool IsEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
-                                           unsigned NextStackOffset,
-                                           const MipsFunctionInfo& FI) const;
+    virtual bool
+    isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                      unsigned NextStackOffset,
+                                      const MipsFunctionInfo& FI) const = 0;
 
     /// copyByValArg - Copy argument registers which were used to pass a byval
     /// argument to the stack. Create a stack frame object for the byval
@@ -351,10 +396,6 @@ namespace llvm {
                   const SmallVectorImpl<SDValue> &OutVals,
                   DebugLoc dl, SelectionDAG &DAG) const;
 
-    virtual MachineBasicBlock *
-      EmitInstrWithCustomInserter(MachineInstr *MI,
-                                  MachineBasicBlock *MBB) const;
-
     // Inline asm support
     ConstraintType getConstraintType(const std::string &Constraint) const;
 
@@ -393,40 +434,20 @@ namespace llvm {
 
     virtual unsigned getJumpTableEncoding() const;
 
-    MachineBasicBlock *EmitBPOSGE32(MachineInstr *MI,
-                                    MachineBasicBlock *BB) const;
-    MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+    MachineBasicBlock *emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                     unsigned Size, unsigned BinOpcode, bool Nand = false) const;
-    MachineBasicBlock *EmitAtomicBinaryPartword(MachineInstr *MI,
+    MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr *MI,
                     MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
                     bool Nand = false) const;
-    MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI,
+    MachineBasicBlock *emitAtomicCmpSwap(MachineInstr *MI,
                                   MachineBasicBlock *BB, unsigned Size) const;
-    MachineBasicBlock *EmitAtomicCmpSwapPartword(MachineInstr *MI,
+    MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr *MI,
                                   MachineBasicBlock *BB, unsigned Size) const;
-    MachineBasicBlock *EmitSel16(unsigned Opc, MachineInstr *MI,
-                                 MachineBasicBlock *BB) const;
-    MachineBasicBlock *EmitSeliT16(unsigned Opc1, unsigned Opc2,
-                                  MachineInstr *MI,
-                                  MachineBasicBlock *BB) const;
-
-    MachineBasicBlock *EmitSelT16(unsigned Opc1, unsigned Opc2,
-                                  MachineInstr *MI,
-                                  MachineBasicBlock *BB) const;
-    MachineBasicBlock *EmitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
-                               MachineInstr *MI,
-                               MachineBasicBlock *BB) const;
-    MachineBasicBlock *EmitFEXT_T8I8I16_ins(
-      unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc,
-      MachineInstr *MI,  MachineBasicBlock *BB) const;
-    MachineBasicBlock *EmitFEXT_CCRX16_ins(
-      unsigned SltOpc,
-      MachineInstr *MI,  MachineBasicBlock *BB) const;
-    MachineBasicBlock *EmitFEXT_CCRXI16_ins(
-      unsigned SltiOpc, unsigned SltiXOpc,
-      MachineInstr *MI,  MachineBasicBlock *BB )const;
-
   };
+
+  /// Create MipsTargetLowering objects.
+  const MipsTargetLowering *createMips16TargetLowering(MipsTargetMachine &TM);
+  const MipsTargetLowering *createMipsSETargetLowering(MipsTargetMachine &TM);
 }
 
 #endif // MipsISELLOWERING_H
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 76644c1..ad92d41 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -93,81 +93,11 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                   MachineBasicBlock *&TBB,
                                   MachineBasicBlock *&FBB,
                                   SmallVectorImpl<MachineOperand> &Cond,
-                                  bool AllowModify) const
-{
-
-  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
-
-  // Skip all the debug instructions.
-  while (I != REnd && I->isDebugValue())
-    ++I;
-
-  if (I == REnd || !isUnpredicatedTerminator(&*I)) {
-    // If this block ends with no branches (it just falls through to its succ)
-    // just return false, leaving TBB/FBB null.
-    TBB = FBB = NULL;
-    return false;
-  }
-
-  MachineInstr *LastInst = &*I;
-  unsigned LastOpc = LastInst->getOpcode();
-
-  // Not an analyzable branch (must be an indirect jump).
-  if (!GetAnalyzableBrOpc(LastOpc))
-    return true;
-
-  // Get the second to last instruction in the block.
-  unsigned SecondLastOpc = 0;
-  MachineInstr *SecondLastInst = NULL;
-
-  if (++I != REnd) {
-    SecondLastInst = &*I;
-    SecondLastOpc = GetAnalyzableBrOpc(SecondLastInst->getOpcode());
-
-    // Not an analyzable branch (must be an indirect jump).
-    if (isUnpredicatedTerminator(SecondLastInst) && !SecondLastOpc)
-      return true;
-  }
-
-  // If there is only one terminator instruction, process it.
-  if (!SecondLastOpc) {
-    // Unconditional branch
-    if (LastOpc == UncondBrOpc) {
-      TBB = LastInst->getOperand(0).getMBB();
-      return false;
-    }
-
-    // Conditional branch
-    AnalyzeCondBr(LastInst, LastOpc, TBB, Cond);
-    return false;
-  }
+                                  bool AllowModify) const {
+  SmallVector<MachineInstr*, 2> BranchInstrs;
+  BranchType BT = AnalyzeBranch(MBB, TBB, FBB, Cond, AllowModify, BranchInstrs);
 
-  // If we reached here, there are two branches.
-  // If there are three terminators, we don't know what sort of block this is.
-  if (++I != REnd && isUnpredicatedTerminator(&*I))
-    return true;
-
-  // If second to last instruction is an unconditional branch,
-  // analyze it and remove the last instruction.
-  if (SecondLastOpc == UncondBrOpc) {
-    // Return if the last instruction cannot be removed.
-    if (!AllowModify)
-      return true;
-
-    TBB = SecondLastInst->getOperand(0).getMBB();
-    LastInst->eraseFromParent();
-    return false;
-  }
-
-  // Conditional branch followed by an unconditional branch.
-  // The last one must be unconditional.
-  if (LastOpc != UncondBrOpc)
-    return true;
-
-  AnalyzeCondBr(SecondLastInst, SecondLastOpc, TBB, Cond);
-  FBB = LastInst->getOperand(0).getMBB();
-
-  return false;
+  return (BT == BT_None) || (BT == BT_Indirect);
 }
 
 void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB,
@@ -256,6 +186,90 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
   return false;
 }
 
+MipsInstrInfo::BranchType MipsInstrInfo::
+AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+              MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond,
+              bool AllowModify,
+              SmallVectorImpl<MachineInstr*> &BranchInstrs) const {
+
+  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
+
+  // Skip all the debug instructions.
+  while (I != REnd && I->isDebugValue())
+    ++I;
+
+  if (I == REnd || !isUnpredicatedTerminator(&*I)) {
+    // This block ends with no branches (it just falls through to its succ).
+    // Leave TBB/FBB null.
+    TBB = FBB = NULL;
+    return BT_NoBranch;
+  }
+
+  MachineInstr *LastInst = &*I;
+  unsigned LastOpc = LastInst->getOpcode();
+  BranchInstrs.push_back(LastInst);
+
+  // Not an analyzable branch (e.g., indirect jump).
+  if (!GetAnalyzableBrOpc(LastOpc))
+    return LastInst->isIndirectBranch() ? BT_Indirect : BT_None;
+
+  // Get the second to last instruction in the block.
+  unsigned SecondLastOpc = 0;
+  MachineInstr *SecondLastInst = NULL;
+
+  if (++I != REnd) {
+    SecondLastInst = &*I;
+    SecondLastOpc = GetAnalyzableBrOpc(SecondLastInst->getOpcode());
+
+    // Not an analyzable branch (must be an indirect jump).
+    if (isUnpredicatedTerminator(SecondLastInst) && !SecondLastOpc)
+      return BT_None;
+  }
+
+  // If there is only one terminator instruction, process it.
+  if (!SecondLastOpc) {
+    // Unconditional branch
+    if (LastOpc == UncondBrOpc) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return BT_Uncond;
+    }
+
+    // Conditional branch
+    AnalyzeCondBr(LastInst, LastOpc, TBB, Cond);
+    return BT_Cond;
+  }
+
+  // If we reached here, there are two branches.
+  // If there are three terminators, we don't know what sort of block this is.
+  if (++I != REnd && isUnpredicatedTerminator(&*I))
+    return BT_None;
+
+  BranchInstrs.insert(BranchInstrs.begin(), SecondLastInst);
+
+  // If second to last instruction is an unconditional branch,
+  // analyze it and remove the last instruction.
+  if (SecondLastOpc == UncondBrOpc) {
+    // Return if the last instruction cannot be removed.
+    if (!AllowModify)
+      return BT_None;
+
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    LastInst->eraseFromParent();
+    BranchInstrs.pop_back();
+    return BT_Uncond;
+  }
+
+  // Conditional branch followed by an unconditional branch.
+  // The last one must be unconditional.
+  if (LastOpc != UncondBrOpc)
+    return BT_None;
+
+  AnalyzeCondBr(SecondLastInst, SecondLastOpc, TBB, Cond);
+  FBB = LastInst->getOperand(0).getMBB();
+
+  return BT_CondUncond;
+}
+
 /// Return the number of bytes of code the specified instruction may be.
 unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
   switch (MI->getOpcode()) {
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index aca2bc7..3cd9088 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -31,6 +31,15 @@ protected:
   unsigned UncondBrOpc;
 
 public:
+  enum BranchType {
+    BT_None,       // Couldn't analyze branch.
+    BT_NoBranch,   // No branches found.
+    BT_Uncond,     // One unconditional branch.
+    BT_Cond,       // One conditional branch.
+    BT_CondUncond, // A conditional branch followed by an unconditional branch.
+    BT_Indirect    // One indirct branch.
+  };
+
   explicit MipsInstrInfo(MipsTargetMachine &TM, unsigned UncondBrOpc);
 
   static const MipsInstrInfo *create(MipsTargetMachine &TM);
@@ -51,6 +60,12 @@ public:
   virtual
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
 
+  BranchType AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                           MachineBasicBlock *&FBB,
+                           SmallVectorImpl<MachineOperand> &Cond,
+                           bool AllowModify,
+                           SmallVectorImpl<MachineInstr*> &BranchInstrs) const;
+
   virtual MachineInstr* emitFrameIndexDebugValue(MachineFunction &MF,
                                                  int FrameIx, uint64_t Offset,
                                                  const MDNode *MDPtr,
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index de09c9e..25b5d24 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -299,6 +299,9 @@ def HI16 : SDNodeXForm<imm, [{
   return getImm(N, (N->getZExtValue() >> 16) & 0xFFFF);
 }]>;
 
+// Plus 1.
+def Plus1 : SDNodeXForm<imm, [{ return getImm(N, N->getSExtValue() + 1); }]>;
+
 // Node immediate fits as 16-bit sign extended on target immediate.
 // e.g. addi, andi
 def immSExt8  : PatLeaf<(imm), [{ return isInt<8>(N->getSExtValue()); }]>;
@@ -331,6 +334,11 @@ def immLow16Zero : PatLeaf<(imm), [{
 // shamt field must fit in 5 bits.
 def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>;
 
+// True if (N + 1) fits in 16-bit field.
+def immSExt16Plus1 : PatLeaf<(imm), [{
+  return isInt<17>(N->getSExtValue()) && isInt<16>(N->getSExtValue() + 1);
+}]>;
+
 // Mips Address Mode! SDNode frameindex could possibily be a match
 // since load and store instructions from stack used it.
 def addr :
@@ -977,7 +985,7 @@ def : InstAlias<"move $dst, $src",
                 (ADDu CPURegsOpnd:$dst, CPURegsOpnd:$src,ZERO), 1>,
       Requires<[NotMips64]>;
 def : InstAlias<"move $dst, $src",
-                (OR CPURegsOpnd:$dst, CPURegsOpnd:$src,ZERO), 0>,
+                (OR CPURegsOpnd:$dst, CPURegsOpnd:$src,ZERO), 1>,
       Requires<[NotMips64]>;
 def : InstAlias<"bal $offset", (BGEZAL RA, brtarget:$offset), 1>;
 def : InstAlias<"addu $rs, $rt, $imm",
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 13b2a6a..3c210e7 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -58,7 +58,8 @@ public:
                            int SPAdj, unsigned FIOperandNum,
                            RegScavenger *RS = NULL) const;
 
-  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                       RegScavenger *RS = NULL) const;
 
   /// Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const;
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index f93dd86..6d76e8a 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -18,6 +18,10 @@ def sub_lo     : SubRegIndex;
 def sub_hi     : SubRegIndex;
 }
 
+class Unallocatable {
+  bit isAllocatable = 0;
+}
+
 // We have banks of 32 registers each.
 class MipsReg<bits<16> Enc, string n> : Register<n> {
   let HWEncoding = Enc;
@@ -291,9 +295,9 @@ def CPU16Regs : RegisterClass<"Mips", [i32], 32, (add
   // Callee save
   S0, S1)>;
 
-def CPURAReg : RegisterClass<"Mips", [i32], 32, (add RA)>;
+def CPURAReg : RegisterClass<"Mips", [i32], 32, (add RA)>, Unallocatable;
 
-def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>;
+def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
 
 // 64bit fp:
 // * FGR64  - 32 64-bit registers
@@ -319,18 +323,19 @@ def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
 def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>;
 
 // Condition Register for floating point operations
-def CCR  : RegisterClass<"Mips", [i32], 32, (add FCR31,FCC0)>;
+def CCR  : RegisterClass<"Mips", [i32], 32, (add FCR31,FCC0)>, Unallocatable;
 
 // Hi/Lo Registers
-def HILO : RegisterClass<"Mips", [i32], 32, (add HI, LO)>;
-def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)>;
+def HILO : RegisterClass<"Mips", [i32], 32, (add HI, LO)>, Unallocatable;
+def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)>, Unallocatable;
 
 // Hardware registers
-def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>;
-def HWRegs64 : RegisterClass<"Mips", [i64], 32, (add HWR29_64)>;
+def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>, Unallocatable;
+def HWRegs64 : RegisterClass<"Mips", [i64], 64, (add HWR29_64)>, Unallocatable;
 
 // Accumulator Registers
-def ACRegs : RegisterClass<"Mips", [i64], 64, (sequence "AC%u", 0, 3)>;
+def ACRegs : RegisterClass<"Mips", [i64], 64, (sequence "AC%u", 0, 3)>,
+             Unallocatable;
 
 def CPURegsAsmOperand : AsmOperandClass {
   let Name = "CPURegsAsm";
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
new file mode 100644
index 0000000..e22c3c8
--- /dev/null
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -0,0 +1,460 @@
+//===-- MipsSEISelDAGToDAG.cpp - A Dag to Dag Inst Selector for MipsSE ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-isel"
+#include "MipsSEISelDAGToDAG.h"
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+
+bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
+                                                const MachineInstr& MI) {
+  unsigned DstReg = 0, ZeroReg = 0;
+
+  // Check if MI is "addiu $dst, $zero, 0" or "daddiu $dst, $zero, 0".
+  if ((MI.getOpcode() == Mips::ADDiu) &&
+      (MI.getOperand(1).getReg() == Mips::ZERO) &&
+      (MI.getOperand(2).getImm() == 0)) {
+    DstReg = MI.getOperand(0).getReg();
+    ZeroReg = Mips::ZERO;
+  } else if ((MI.getOpcode() == Mips::DADDiu) &&
+             (MI.getOperand(1).getReg() == Mips::ZERO_64) &&
+             (MI.getOperand(2).getImm() == 0)) {
+    DstReg = MI.getOperand(0).getReg();
+    ZeroReg = Mips::ZERO_64;
+  }
+
+  if (!DstReg)
+    return false;
+
+  // Replace uses with ZeroReg.
+  for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
+       E = MRI->use_end(); U != E;) {
+    MachineOperand &MO = U.getOperand();
+    unsigned OpNo = U.getOperandNo();
+    MachineInstr *MI = MO.getParent();
+    ++U;
+
+    // Do not replace if it is a phi's operand or is tied to def operand.
+    if (MI->isPHI() || MI->isRegTiedToDefOperand(OpNo) || MI->isPseudo())
+      continue;
+
+    MO.setReg(ZeroReg);
+  }
+
+  return true;
+}
+
+void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (!MipsFI->globalBaseRegSet())
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+  const TargetRegisterClass *RC;
+
+  if (Subtarget.isABI_N64())
+    RC = (const TargetRegisterClass*)&Mips::CPU64RegsRegClass;
+  else
+    RC = (const TargetRegisterClass*)&Mips::CPURegsRegClass;
+
+  V0 = RegInfo.createVirtualRegister(RC);
+  V1 = RegInfo.createVirtualRegister(RC);
+
+  if (Subtarget.isABI_N64()) {
+    MF.getRegInfo().addLiveIn(Mips::T9_64);
+    MBB.addLiveIn(Mips::T9_64);
+
+    // lui $v0, %hi(%neg(%gp_rel(fname)))
+    // daddu $v1, $v0, $t9
+    // daddiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+    const GlobalValue *FName = MF.getFunction();
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0)
+      .addReg(Mips::T9_64);
+    BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+    return;
+  }
+
+  if (MF.getTarget().getRelocationModel() == Reloc::Static) {
+    // Set global register to __gnu_local_gp.
+    //
+    // lui   $v0, %hi(__gnu_local_gp)
+    // addiu $globalbasereg, $v0, %lo(__gnu_local_gp)
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+      .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0)
+      .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO);
+    return;
+  }
+
+  MF.getRegInfo().addLiveIn(Mips::T9);
+  MBB.addLiveIn(Mips::T9);
+
+  if (Subtarget.isABI_N32()) {
+    // lui $v0, %hi(%neg(%gp_rel(fname)))
+    // addu $v1, $v0, $t9
+    // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+    const GlobalValue *FName = MF.getFunction();
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+    return;
+  }
+
+  assert(Subtarget.isABI_O32());
+
+  // For O32 ABI, the following instruction sequence is emitted to initialize
+  // the global base register:
+  //
+  //  0. lui   $2, %hi(_gp_disp)
+  //  1. addiu $2, $2, %lo(_gp_disp)
+  //  2. addu  $globalbasereg, $2, $t9
+  //
+  // We emit only the last instruction here.
+  //
+  // GNU linker requires that the first two instructions appear at the beginning
+  // of a function and no instructions be inserted before or between them.
+  // The two instructions are emitted during lowering to MC layer in order to
+  // avoid any reordering.
+  //
+  // Register $2 (Mips::V0) is added to the list of live-in registers to ensure
+  // the value instruction 1 (addiu) defines is valid when instruction 2 (addu)
+  // reads it.
+  MF.getRegInfo().addLiveIn(Mips::V0);
+  MBB.addLiveIn(Mips::V0);
+  BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg)
+    .addReg(Mips::V0).addReg(Mips::T9);
+}
+
+void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+  initGlobalBaseReg(MF);
+
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+  for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE;
+       ++MFI)
+    for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I)
+      replaceUsesWithZeroReg(MRI, *I);
+}
+
+/// Select multiply instructions.
+std::pair<SDNode*, SDNode*>
+MipsSEDAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, DebugLoc DL, EVT Ty,
+                               bool HasLo, bool HasHi) {
+  SDNode *Lo = 0, *Hi = 0;
+  SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0),
+                                       N->getOperand(1));
+  SDValue InFlag = SDValue(Mul, 0);
+
+  if (HasLo) {
+    unsigned Opcode = (Ty == MVT::i32 ? Mips::MFLO : Mips::MFLO64);
+    Lo = CurDAG->getMachineNode(Opcode, DL, Ty, MVT::Glue, InFlag);
+    InFlag = SDValue(Lo, 1);
+  }
+  if (HasHi) {
+    unsigned Opcode = (Ty == MVT::i32 ? Mips::MFHI : Mips::MFHI64);
+    Hi = CurDAG->getMachineNode(Opcode, DL, Ty, InFlag);
+  }
+  return std::make_pair(Lo, Hi);
+}
+
+SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
+                                           SDValue CmpLHS, DebugLoc DL,
+                                           SDNode *Node) const {
+  unsigned Opc = InFlag.getOpcode(); (void)Opc;
+
+  assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+          (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+         "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+  SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+  SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
+  EVT VT = LHS.getValueType();
+
+  SDNode *Carry = CurDAG->getMachineNode(Mips::SLTu, DL, VT, Ops, 2);
+  SDNode *AddCarry = CurDAG->getMachineNode(Mips::ADDu, DL, VT,
+                                            SDValue(Carry, 0), RHS);
+  return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS,
+                              SDValue(AddCarry, 0));
+}
+
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                          SDValue &Offset) const {
+  EVT ValTy = Addr.getValueType();
+
+  // if Address is FI, get the TargetFrameIndex.
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+    Offset = CurDAG->getTargetConstant(0, ValTy);
+    return true;
+  }
+
+  // on PIC code Load GA
+  if (Addr.getOpcode() == MipsISD::Wrapper) {
+    Base   = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+
+  if (TM.getRelocationModel() != Reloc::PIC_) {
+    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+        Addr.getOpcode() == ISD::TargetGlobalAddress))
+      return false;
+  }
+
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<16>(CN->getSExtValue())) {
+
+      // If the first operand is a FI, get the TargetFI Node
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+                                  (Addr.getOperand(0)))
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
+      return true;
+    }
+  }
+
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    // When loading from constant pools, load the lower address part in
+    // the instruction itself. Example, instead of:
+    //  lui $2, %hi($CPI1_0)
+    //  addiu $2, $2, %lo($CPI1_0)
+    //  lwc1 $f0, 0($2)
+    // Generate:
+    //  lui $2, %hi($CPI1_0)
+    //  lwc1 $f0, %lo($CPI1_0)($2)
+    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
+      if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+          isa<JumpTableSDNode>(Opnd0)) {
+        Base = Addr.getOperand(0);
+        Offset = Opnd0;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) const {
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, Addr.getValueType());
+  return true;
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const {
+  return selectAddrRegImm(Addr, Base, Offset) ||
+    selectAddrDefault(Addr, Base, Offset);
+}
+
+std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+  DebugLoc DL = Node->getDebugLoc();
+
+  ///
+  // Instruction Selection not handled by the auto-generated
+  // tablegen selection should be handled here.
+  ///
+  EVT NodeTy = Node->getValueType(0);
+  SDNode *Result;
+  unsigned MultOpc;
+
+  switch(Opcode) {
+  default: break;
+
+  case ISD::SUBE: {
+    SDValue InFlag = Node->getOperand(2);
+    Result = selectAddESubE(Mips::SUBu, InFlag, InFlag.getOperand(0), DL, Node);
+    return std::make_pair(true, Result);
+  }
+
+  case ISD::ADDE: {
+    SDValue InFlag = Node->getOperand(2);
+    Result = selectAddESubE(Mips::ADDu, InFlag, InFlag.getValue(0), DL, Node);
+    return std::make_pair(true, Result);
+  }
+
+  /// Mul with two results
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI: {
+    if (NodeTy == MVT::i32)
+      MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MULTu : Mips::MULT);
+    else
+      MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::DMULTu : Mips::DMULT);
+
+    std::pair<SDNode*, SDNode*> LoHi = selectMULT(Node, MultOpc, DL, NodeTy,
+                                                  true, true);
+
+    if (!SDValue(Node, 0).use_empty())
+      ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));
+
+    if (!SDValue(Node, 1).use_empty())
+      ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
+
+    return std::make_pair(true, (SDNode*)NULL);
+  }
+
+  /// Special Muls
+  case ISD::MUL: {
+    // Mips32 has a 32-bit three operand mul instruction.
+    if (Subtarget.hasMips32() && NodeTy == MVT::i32)
+      break;
+    MultOpc = NodeTy == MVT::i32 ? Mips::MULT : Mips::DMULT;
+    Result = selectMULT(Node, MultOpc, DL, NodeTy, true, false).first;
+    return std::make_pair(true, Result);
+  }
+  case ISD::MULHS:
+  case ISD::MULHU: {
+    if (NodeTy == MVT::i32)
+      MultOpc = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT);
+    else
+      MultOpc = (Opcode == ISD::MULHU ? Mips::DMULTu : Mips::DMULT);
+
+    Result = selectMULT(Node, MultOpc, DL, NodeTy, false, true).second;
+    return std::make_pair(true, Result);
+  }
+
+  case ISD::ConstantFP: {
+    ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node);
+    if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
+      if (Subtarget.hasMips64()) {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO_64, MVT::i64);
+        Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero);
+      } else {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO, MVT::i32);
+        Result = CurDAG->getMachineNode(Mips::BuildPairF64, DL, MVT::f64, Zero,
+                                        Zero);
+      }
+
+      return std::make_pair(true, Result);
+    }
+    break;
+  }
+
+  case ISD::Constant: {
+    const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Node);
+    unsigned Size = CN->getValueSizeInBits(0);
+
+    if (Size == 32)
+      break;
+
+    MipsAnalyzeImmediate AnalyzeImm;
+    int64_t Imm = CN->getSExtValue();
+
+    const MipsAnalyzeImmediate::InstSeq &Seq =
+      AnalyzeImm.Analyze(Imm, Size, false);
+
+    MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+    DebugLoc DL = CN->getDebugLoc();
+    SDNode *RegOpnd;
+    SDValue ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
+                                                MVT::i64);
+
+    // The first instruction can be a LUi which is different from other
+    // instructions (ADDiu, ORI and SLL) in that it does not have a register
+    // operand.
+    if (Inst->Opc == Mips::LUi64)
+      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64, ImmOpnd);
+    else
+      RegOpnd =
+        CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
+                               CurDAG->getRegister(Mips::ZERO_64, MVT::i64),
+                               ImmOpnd);
+
+    // The remaining instructions in the sequence are handled here.
+    for (++Inst; Inst != Seq.end(); ++Inst) {
+      ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
+                                          MVT::i64);
+      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
+                                       SDValue(RegOpnd, 0), ImmOpnd);
+    }
+
+    return std::make_pair(true, RegOpnd);
+  }
+
+  case MipsISD::ThreadPointer: {
+    EVT PtrVT = TLI.getPointerTy();
+    unsigned RdhwrOpc, SrcReg, DestReg;
+
+    if (PtrVT == MVT::i32) {
+      RdhwrOpc = Mips::RDHWR;
+      SrcReg = Mips::HWR29;
+      DestReg = Mips::V1;
+    } else {
+      RdhwrOpc = Mips::RDHWR64;
+      SrcReg = Mips::HWR29_64;
+      DestReg = Mips::V1_64;
+    }
+
+    SDNode *Rdhwr =
+      CurDAG->getMachineNode(RdhwrOpc, Node->getDebugLoc(),
+                             Node->getValueType(0),
+                             CurDAG->getRegister(SrcReg, PtrVT));
+    SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
+                                         SDValue(Rdhwr, 0));
+    SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
+    ReplaceUses(SDValue(Node, 0), ResNode);
+    return std::make_pair(true, ResNode.getNode());
+  }
+  }
+
+  return std::make_pair(false, (SDNode*)NULL);
+}
+
+FunctionPass *llvm::createMipsSEISelDag(MipsTargetMachine &TM) {
+  return new MipsSEDAGToDAGISel(TM);
+}
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
new file mode 100644
index 0000000..6137ab0
--- /dev/null
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -0,0 +1,57 @@
+//===-- MipsSEISelDAGToDAG.h - A Dag to Dag Inst Selector for MipsSE -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSEISELDAGTODAG_H
+#define MIPSSEISELDAGTODAG_H
+
+#include "MipsISelDAGToDAG.h"
+
+namespace llvm {
+
+class MipsSEDAGToDAGISel : public MipsDAGToDAGISel {
+
+public:
+  explicit MipsSEDAGToDAGISel(MipsTargetMachine &TM) : MipsDAGToDAGISel(TM) {}
+
+private:
+  bool replaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
+
+  std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, DebugLoc dl,
+                                         EVT Ty, bool HasLo, bool HasHi);
+
+  SDNode *selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
+                         DebugLoc DL, SDNode *Node) const;
+
+  virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
+  virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
+                                 SDValue &Offset) const;
+
+  virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
+                             SDValue &Offset) const;
+
+  virtual std::pair<bool, SDNode*> selectNode(SDNode *Node);
+
+  virtual void processFunctionAfterISel(MachineFunction &MF);
+
+  // Insert instructions to initialize the global base register in the
+  // first MBB of the function.
+  void initGlobalBaseReg(MachineFunction &MF);
+};
+
+FunctionPass *createMipsSEISelDag(MipsTargetMachine &TM);
+
+}
+
+#endif
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
new file mode 100644
index 0000000..287e2ed
--- /dev/null
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -0,0 +1,197 @@
+//===-- MipsSEISelLowering.cpp - MipsSE DAG Lowering Interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsSEISelLowering.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden,
+                    cl::desc("MIPS: Enable tail calls."), cl::init(false));
+
+MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
+  : MipsTargetLowering(TM) {
+  // Set up the register classes
+  addRegisterClass(MVT::i32, &Mips::CPURegsRegClass);
+
+  if (HasMips64)
+    addRegisterClass(MVT::i64, &Mips::CPU64RegsRegClass);
+
+  if (Subtarget->hasDSP()) {
+    MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
+
+    for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
+      addRegisterClass(VecTys[i], &Mips::DSPRegsRegClass);
+
+      // Expand all builtin opcodes.
+      for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+        setOperationAction(Opc, VecTys[i], Expand);
+
+      setOperationAction(ISD::LOAD, VecTys[i], Legal);
+      setOperationAction(ISD::STORE, VecTys[i], Legal);
+      setOperationAction(ISD::BITCAST, VecTys[i], Legal);
+    }
+  }
+
+  if (!TM.Options.UseSoftFloat) {
+    addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
+
+    // When dealing with single precision only, use libcalls
+    if (!Subtarget->isSingleFloat()) {
+      if (HasMips64)
+        addRegisterClass(MVT::f64, &Mips::FGR64RegClass);
+      else
+        addRegisterClass(MVT::f64, &Mips::AFGR64RegClass);
+    }
+  }
+
+  setOperationAction(ISD::MEMBARRIER,         MVT::Other, Custom);
+  setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Custom);
+  setOperationAction(ISD::LOAD,               MVT::i32, Custom);
+  setOperationAction(ISD::STORE,              MVT::i32, Custom);
+
+  computeRegisterProperties();
+}
+
+const MipsTargetLowering *
+llvm::createMipsSETargetLowering(MipsTargetMachine &TM) {
+  return new MipsSETargetLowering(TM);
+}
+
+
+bool
+MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+  MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
+
+  switch (SVT) {
+  case MVT::i64:
+  case MVT::i32:
+    if (Fast)
+      *Fast = true;
+    return true;
+  default:
+    return false;
+  }
+}
+
+MachineBasicBlock *
+MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                  MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default:
+    return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case Mips::BPOSGE32_PSEUDO:
+    return emitBPOSGE32(MI, BB);
+  }
+}
+
+bool MipsSETargetLowering::
+isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                  unsigned NextStackOffset,
+                                  const MipsFunctionInfo& FI) const {
+  if (!EnableMipsTailCalls)
+    return false;
+
+  // Return false if either the callee or caller has a byval argument.
+  if (MipsCCInfo.hasByValArg() || FI.hasByvalArg())
+    return false;
+
+  // Return true if the callee's argument area is no larger than the
+  // caller's.
+  return NextStackOffset <= FI.getIncomingArgSize();
+}
+
+void MipsSETargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+  // T9 should contain the address of the callee function if
+  // -reloction-model=pic or it is an indirect call.
+  if (IsPICCall || !GlobalOrExternal) {
+    unsigned T9Reg = IsN64 ? Mips::T9_64 : Mips::T9;
+    RegsToPass.push_front(std::make_pair(T9Reg, Callee));
+  } else
+    Ops.push_back(Callee);
+
+  MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
+                                  InternalLinkage, CLI, Callee, Chain);
+}
+
+MachineBasicBlock * MipsSETargetLowering::
+emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
+  // $bb:
+  //  bposge32_pseudo $vr0
+  //  =>
+  // $bb:
+  //  bposge32 $tbb
+  // $fbb:
+  //  li $vr2, 0
+  //  b $sink
+  // $tbb:
+  //  li $vr1, 1
+  // $sink:
+  //  $vr0 = phi($vr2, $fbb, $vr1, $tbb)
+
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetRegisterClass *RC = &Mips::CPURegsRegClass;
+  DebugLoc DL = MI->getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = llvm::next(MachineFunction::iterator(BB));
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *Sink  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, FBB);
+  F->insert(It, TBB);
+  F->insert(It, Sink);
+
+  // Transfer the remainder of BB and its successor edges to Sink.
+  Sink->splice(Sink->begin(), BB, llvm::next(MachineBasicBlock::iterator(MI)),
+               BB->end());
+  Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add successors.
+  BB->addSuccessor(FBB);
+  BB->addSuccessor(TBB);
+  FBB->addSuccessor(Sink);
+  TBB->addSuccessor(Sink);
+
+  // Insert the real bposge32 instruction to $BB.
+  BuildMI(BB, DL, TII->get(Mips::BPOSGE32)).addMBB(TBB);
+
+  // Fill $FBB.
+  unsigned VR2 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), VR2)
+    .addReg(Mips::ZERO).addImm(0);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+  // Fill $TBB.
+  unsigned VR1 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), VR1)
+    .addReg(Mips::ZERO).addImm(1);
+
+  // Insert phi function to $Sink.
+  BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(VR2).addMBB(FBB).addReg(VR1).addMBB(TBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return Sink;
+}
diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
new file mode 100644
index 0000000..04a28ce
--- /dev/null
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -0,0 +1,46 @@
+//===-- MipsSEISelLowering.h - MipsSE DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MipsSEISELLOWERING_H
+#define MipsSEISELLOWERING_H
+
+#include "MipsISelLowering.h"
+
+namespace llvm {
+  class MipsSETargetLowering : public MipsTargetLowering  {
+  public:
+    explicit MipsSETargetLowering(MipsTargetMachine &TM);
+
+    virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+
+    virtual MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+
+  private:
+    virtual bool
+    isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                      unsigned NextStackOffset,
+                                      const MipsFunctionInfo& FI) const;
+
+    virtual void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
+
+    MachineBasicBlock *emitBPOSGE32(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const;
+  };
+}
+
+#endif // MipsSEISELLOWERING_H
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 75b4c98..e11e5d1 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -33,7 +33,7 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
   IsLinux(true), HasSEInReg(false), HasCondMov(false), HasSwap(false),
   HasBitCount(false), HasFPIdx(false),
   InMips16Mode(false), InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
-  IsAndroid(false), RM(_RM)
+  RM(_RM)
 {
   std::string CPUName = CPU;
   if (CPUName.empty())
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 32baa3d..7a2e47c 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -95,9 +95,6 @@ protected:
   // HasDSP, HasDSPR2 -- supports DSP ASE.
   bool HasDSP, HasDSPR2;
 
-  // IsAndroid -- target is android
-  bool IsAndroid;
-
   InstrItineraryData InstrItins;
 
   // The instance to the register info section object
@@ -144,7 +141,6 @@ public:
   bool inMicroMipsMode() const { return InMicroMipsMode; }
   bool hasDSP() const { return HasDSP; }
   bool hasDSPR2() const { return HasDSPR2; }
-  bool isAndroid() const { return IsAndroid; }
   bool isLinux() const { return IsLinux; }
   bool useSmallSection() const { return UseSmallSection; }
 
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 1b91e8b..3336358 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -54,7 +54,7 @@ MipsTargetMachine(const Target &T, StringRef TT,
                 "E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32-S64")),
     InstrInfo(MipsInstrInfo::create(*this)),
     FrameLowering(MipsFrameLowering::create(*this, Subtarget)),
-    TLInfo(*this), TSInfo(*this), JITInfo() {
+    TLInfo(MipsTargetLowering::create(*this)), TSInfo(*this), JITInfo() {
 }
 
 void MipsebTargetMachine::anchor() { }
@@ -116,6 +116,8 @@ bool MipsPassConfig::addPreEmitPass() {
   // NOTE: long branch has not been implemented for mips16.
   if (TM.getSubtarget<MipsSubtarget>().hasStandardEncoding())
     addPass(createMipsLongBranchPass(TM));
+  if (TM.getSubtarget<MipsSubtarget>().inMips16Mode())
+    addPass(createMipsConstantIslandPass(TM));
 
   return true;
 }
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index c4928c2..7e5f192 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -34,7 +34,7 @@ class MipsTargetMachine : public LLVMTargetMachine {
   const DataLayout    DL; // Calculates type size & alignment
   OwningPtr<const MipsInstrInfo> InstrInfo;
   OwningPtr<const MipsFrameLowering> FrameLowering;
-  MipsTargetLowering  TLInfo;
+  OwningPtr<const MipsTargetLowering> TLInfo;
   MipsSelectionDAGInfo TSInfo;
   MipsJITInfo JITInfo;
 
@@ -63,7 +63,7 @@ public:
   }
 
   virtual const MipsTargetLowering *getTargetLowering() const {
-    return &TLInfo;
+    return TLInfo.get();
   }
 
   virtual const MipsSelectionDAGInfo* getSelectionDAGInfo() const {
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 5ee747a..e9a9fbf 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -101,7 +101,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
 
   // Operations not directly supported by NVPTX.
   setOperationAction(ISD::SELECT_CC,         MVT::Other, Expand);
-  setOperationAction(ISD::BR_CC,             MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,             MVT::f32, Expand);
+  setOperationAction(ISD::BR_CC,             MVT::f64, Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i1,  Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i8,  Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i16, Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i64, Expand);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Expand);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
@@ -716,16 +722,15 @@ NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       for (unsigned i=0,e=Ins.size(); i!=e; ++i) {
         unsigned sz = Ins[i].VT.getSizeInBits();
         if (Ins[i].VT.isInteger() && (sz < 8)) sz = 8;
-        std::vector<EVT> LoadRetVTs;
-        LoadRetVTs.push_back(Ins[i].VT);
-        LoadRetVTs.push_back(MVT::Other); LoadRetVTs.push_back(MVT::Glue);
-        std::vector<SDValue> LoadRetOps;
-        LoadRetOps.push_back(Chain);
-        LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
-        LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32));
-        LoadRetOps.push_back(InFlag);
+        EVT LoadRetVTs[] = { Ins[i].VT, MVT::Other, MVT::Glue };
+        SDValue LoadRetOps[] = {
+          Chain,
+          DAG.getConstant(1, MVT::i32),
+          DAG.getConstant(resoffset, MVT::i32),
+          InFlag
+        };
         SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, LoadRetVTs,
-                                     &LoadRetOps[0], LoadRetOps.size());
+                                     LoadRetOps, array_lengthof(LoadRetOps));
         Chain = retval.getValue(1);
         InFlag = retval.getValue(2);
         InVals.push_back(retval);
@@ -750,16 +755,15 @@ NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         }
         std::vector<SDValue> tempRetVals;
         for (unsigned j=0; j<numelems; ++j) {
-          std::vector<EVT> MoveRetVTs;
-          MoveRetVTs.push_back(elemtype);
-          MoveRetVTs.push_back(MVT::Other); MoveRetVTs.push_back(MVT::Glue);
-          std::vector<SDValue> MoveRetOps;
-          MoveRetOps.push_back(Chain);
-          MoveRetOps.push_back(DAG.getConstant(0, MVT::i32));
-          MoveRetOps.push_back(DAG.getConstant(paramNum, MVT::i32));
-          MoveRetOps.push_back(InFlag);
+          EVT MoveRetVTs[] = { elemtype, MVT::Other, MVT::Glue };
+          SDValue MoveRetOps[] = {
+            Chain,
+            DAG.getConstant(0, MVT::i32),
+            DAG.getConstant(paramNum, MVT::i32),
+            InFlag
+          };
           SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, MoveRetVTs,
-                                       &MoveRetOps[0], MoveRetOps.size());
+                                       MoveRetOps, array_lengthof(MoveRetOps));
           Chain = retval.getValue(1);
           InFlag = retval.getValue(2);
           tempRetVals.push_back(retval);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 95e7b55..14afc14 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -136,7 +136,7 @@ public:
   NVPTXTargetMachine *nvTM;
 
   // PTX always uses 32-bit shift amounts
-  virtual MVT getShiftAmountTy(EVT LHSTy) const {
+  virtual MVT getScalarShiftAmountTy(EVT LHSTy) const {
     return MVT::i32;
   }
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index 7917f77..709daa4 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -12,6 +12,8 @@
 
 #include "llvm/MC/MCFixup.h"
 
+#undef PPC
+
 namespace llvm {
 namespace PPC {
 enum Fixups {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 4a42092..38a7420 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -47,6 +47,10 @@ MCObjectWriter *createPPCELFObjectWriter(raw_ostream &OS,
                                          uint8_t OSABI);
 } // End llvm namespace
 
+// Generated files will use "namespace PPC". To avoid symbol clash,
+// undefine PPC here. PPC may be predefined on some hosts.
+#undef PPC
+
 // Defines symbolic names for PowerPC registers.  This defines a mapping from
 // register name to register number.
 //
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
index 972e138..b0680fb 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -17,6 +17,10 @@
 // GCC #defines PPC on Linux but we use it as our namespace name
 #undef PPC
 
+// Generated files will use "namespace PPC". To avoid symbol clash,
+// undefine PPC here. PPC may be predefined on some hosts.
+#undef PPC
+
 namespace llvm {
 namespace PPC {
   /// Predicate - These are "(BI << 5) | BO"  for various predicates.
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index b98cc48..ecece8c 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -189,12 +189,23 @@ INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
 
 /// isCompareEquals - Returns true if the instruction is a compare equals
 /// instruction with an immediate operand.
-static bool isCompareEqualsImm(const MachineInstr *MI, bool &SignedCmp) {
-  if (MI->getOpcode() == PPC::CMPWI || MI->getOpcode() == PPC::CMPDI) {
+static bool isCompareEqualsImm(const MachineInstr *MI, bool &SignedCmp,
+                               bool &Int64Cmp) {
+  if (MI->getOpcode() == PPC::CMPWI) {
     SignedCmp = true;
+    Int64Cmp = false;
     return true;
-  } else if (MI->getOpcode() == PPC::CMPLWI || MI->getOpcode() == PPC::CMPLDI) {
+  } else if (MI->getOpcode() == PPC::CMPDI) {
+    SignedCmp = true;
+    Int64Cmp = true;
+    return true;
+  } else if (MI->getOpcode() == PPC::CMPLWI) {
+    SignedCmp = false;
+    Int64Cmp = false;
+    return true;
+  } else if (MI->getOpcode() == PPC::CMPLDI) {
     SignedCmp = false;
+    Int64Cmp = true;
     return true;
   }
 
@@ -353,9 +364,9 @@ CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
          RI = MRI->reg_begin(IV_Opnd->getReg()), RE = MRI->reg_end();
          RI != RE; ++RI) {
       IV_Opnd = &RI.getOperand();
-      bool SignedCmp;
+      bool SignedCmp, Int64Cmp;
       MachineInstr *MI = IV_Opnd->getParent();
-      if (L->contains(MI) && isCompareEqualsImm(MI, SignedCmp) &&
+      if (L->contains(MI) && isCompareEqualsImm(MI, SignedCmp, Int64Cmp) &&
           MI->getOperand(0).getReg() == PredReg) {
 
         OldInsts.push_back(MI);
@@ -380,14 +391,14 @@ CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
         assert(InitialValue->isReg() && "Expecting register for init value");
         unsigned InitialValueReg = InitialValue->getReg();
   
-        const MachineInstr *DefInstr = MRI->getVRegDef(InitialValueReg);
+        MachineInstr *DefInstr = MRI->getVRegDef(InitialValueReg);
   
         // Here we need to look for an immediate load (an li or lis/ori pair).
         if (DefInstr && (DefInstr->getOpcode() == PPC::ORI8 ||
                          DefInstr->getOpcode() == PPC::ORI)) {
           int64_t start = (short) DefInstr->getOperand(2).getImm();
-          const MachineInstr *DefInstr2 =
-            MRI->getVRegDef(DefInstr->getOperand(0).getReg());
+          MachineInstr *DefInstr2 =
+            MRI->getVRegDef(DefInstr->getOperand(1).getReg());
           if (DefInstr2 && (DefInstr2->getOpcode() == PPC::LIS8 ||
                             DefInstr2->getOpcode() == PPC::LIS)) {
             DEBUG(dbgs() << "  initial constant: " << *DefInstr);
@@ -399,17 +410,33 @@ CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
             if ((count % iv_value) != 0) {
               return 0;
             }
-            return new CountValue(count/iv_value);
+
+            OldInsts.push_back(DefInstr);
+            OldInsts.push_back(DefInstr2);
+
+            // count/iv_value, the trip count, should be positive here. If it
+            // is negative, that indicates that the counter will wrap.
+            if (Int64Cmp)
+              return new CountValue(count/iv_value);
+            else
+              return new CountValue(uint32_t(count/iv_value));
           }
         } else if (DefInstr && (DefInstr->getOpcode() == PPC::LI8 ||
                                 DefInstr->getOpcode() == PPC::LI)) {
           DEBUG(dbgs() << "  initial constant: " << *DefInstr);
 
-          int64_t count = ImmVal - int64_t(short(DefInstr->getOperand(1).getImm()));
+          int64_t count = ImmVal -
+            int64_t(short(DefInstr->getOperand(1).getImm()));
           if ((count % iv_value) != 0) {
             return 0;
           }
-          return new CountValue(count/iv_value);
+
+          OldInsts.push_back(DefInstr);
+
+          if (Int64Cmp)
+            return new CountValue(count/iv_value);
+          else
+            return new CountValue(uint32_t(count/iv_value));
         } else if (iv_value == 1 || iv_value == -1) {
           // We can't determine a constant starting value.
           if (ImmVal == 0) {
@@ -417,8 +444,8 @@ CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
           }
           // FIXME: handle non-zero end value.
         }
-        // FIXME: handle non-unit increments (we might not want to introduce division
-        // but we can handle some 2^n cases with shifts).
+        // FIXME: handle non-unit increments (we might not want to introduce
+        // division but we can handle some 2^n cases with shifts).
   
       }
     }
@@ -489,9 +516,10 @@ bool PPCCTRLoops::isDead(const MachineInstr *MI,
     if (MO.isReg() && MO.isDef()) {
       unsigned Reg = MO.getReg();
       if (!MRI->use_nodbg_empty(Reg)) {
-        // This instruction has users, but if the only user is the phi node for the
-        // parent block, and the only use of that phi node is this instruction, then
-        // this instruction is dead: both it (and the phi node) can be removed.
+        // This instruction has users, but if the only user is the phi node for
+        // the parent block, and the only use of that phi node is this
+        // instruction, then this instruction is dead: both it (and the phi
+        // node) can be removed.
         MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg);
         if (llvm::next(I) == MRI->use_end() &&
             I.getOperand().getParent()->isPHI()) {
@@ -594,6 +622,16 @@ bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
     DEBUG(dbgs() << "failed to get trip count!\n");
     return false;
   }
+
+  if (TripCount->isImm()) {
+    DEBUG(dbgs() << "constant trip count: " << TripCount->getImm() << "\n");
+
+    // FIXME: We currently can't form 64-bit constants
+    // (including 32-bit unsigned constants)
+    if (!isInt<32>(TripCount->getImm()))
+      return false;
+  }
+
   // Does the loop contain any invalid instructions?
   if (containsInvalidInstruction(L)) {
     return false;
@@ -664,13 +702,14 @@ bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
     // Put the trip count in a register for transfer into the count register.
 
     int64_t CountImm = TripCount->getImm();
-    assert(!TripCount->isNeg() && "Constant trip count must be positive");
+    if (TripCount->isNeg())
+      CountImm = -CountImm;
 
     CountReg = MF->getRegInfo().createVirtualRegister(RC);
-    if (CountImm > 0xFFFF) {
+    if (abs64(CountImm) > 0x7FFF) {
       BuildMI(*Preheader, InsertPos, dl,
               TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS),
-              CountReg).addImm(CountImm >> 16);
+              CountReg).addImm((CountImm >> 16) & 0xFFFF);
       unsigned CountReg1 = CountReg;
       CountReg = MF->getRegInfo().createVirtualRegister(RC);
       BuildMI(*Preheader, InsertPos, dl,
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 0a396e6..353560d 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -188,13 +188,26 @@ static bool spillsCR(const MachineFunction &MF) {
   return FuncInfo->isCRSpilled();
 }
 
+static bool hasSpills(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->hasSpills();
+}
+
+static bool hasNonRISpills(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->hasNonRISpills();
+}
+
 /// determineFrameLayout - Determine the size of the frame and maximum call
 /// frame size.
-void PPCFrameLowering::determineFrameLayout(MachineFunction &MF) const {
+unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
+                                                bool UpdateMF,
+                                                bool UseEstimate) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
 
   // Get the number of bytes to allocate from the FrameInfo
-  unsigned FrameSize = MFI->getStackSize();
+  unsigned FrameSize =
+    UseEstimate ? MFI->estimateStackSize(MF) : MFI->getStackSize();
 
   // Get the alignments provided by the target, and the maximum alignment
   // (if any) of the fixed frame objects.
@@ -223,8 +236,9 @@ void PPCFrameLowering::determineFrameLayout(MachineFunction &MF) const {
 	&& spillsCR(MF)) &&
       (!ALIGN_STACK || MaxAlign <= TargetAlign)) { // No special alignment.
     // No need for frame
-    MFI->setStackSize(0);
-    return;
+    if (UpdateMF)
+      MFI->setStackSize(0);
+    return 0;
   }
 
   // Get the maximum call frame size of all the calls.
@@ -241,7 +255,8 @@ void PPCFrameLowering::determineFrameLayout(MachineFunction &MF) const {
     maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
 
   // Update maximum call frame size.
-  MFI->setMaxCallFrameSize(maxCallFrameSize);
+  if (UpdateMF)
+    MFI->setMaxCallFrameSize(maxCallFrameSize);
 
   // Include call frame size in total.
   FrameSize += maxCallFrameSize;
@@ -250,7 +265,10 @@ void PPCFrameLowering::determineFrameLayout(MachineFunction &MF) const {
   FrameSize = (FrameSize + AlignMask) & ~AlignMask;
 
   // Update frame info.
-  MFI->setStackSize(FrameSize);
+  if (UpdateMF)
+    MFI->setStackSize(FrameSize);
+
+  return FrameSize;
 }
 
 // hasFP - Return true if the specified function actually has a dedicated frame
@@ -311,11 +329,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
   MBBI = MBB.begin();
 
   // Work out frame sizes.
-  // FIXME: determineFrameLayout() may change the frame size. This should be
-  // moved upper, to some hook.
-  determineFrameLayout(MF);
-  unsigned FrameSize = MFI->getStackSize();
-
+  unsigned FrameSize = determineFrameLayout(MF);
   int NegFrameSize = -FrameSize;
 
   // Get processor type.
@@ -780,7 +794,7 @@ static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
 
 void
 PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                                   RegScavenger *RS) const {
+                                                   RegScavenger *) const {
   const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
 
   //  Save and clear the LR state.
@@ -822,30 +836,15 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
     int FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true);
     FI->setCRSpillFrameIndex(FrameIdx);
   }
-
-  // Reserve a slot closest to SP or frame pointer if we have a dynalloc or
-  // a large stack, which will require scavenging a register to materialize a
-  // large offset.
-  // FIXME: this doesn't actually check stack size, so is a bit pessimistic
-  // FIXME: doesn't detect whether or not we need to spill vXX, which requires
-  //        r0 for now.
-
-  if (RegInfo->requiresRegisterScavenging(MF))
-    if (needsFP(MF) || spillsCR(MF)) {
-      const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
-      const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
-      const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
-      RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
-                                                         RC->getAlignment(),
-                                                         false));
-    }
 }
 
-void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
-                                                                        const {
+void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                                       RegScavenger *RS) const {
   // Early exit if not using the SVR4 ABI.
-  if (!Subtarget.isSVR4ABI())
+  if (!Subtarget.isSVR4ABI()) {
+    addScavengingSpillSlot(MF, RS);
     return;
+  }
 
   // Get callee saved register information.
   MachineFrameInfo *FFI = MF.getFrameInfo();
@@ -853,6 +852,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
 
   // Early exit if no callee saved registers are modified!
   if (CSI.empty() && !needsFP(MF)) {
+    addScavengingSpillSlot(MF, RS);
     return;
   }
 
@@ -1031,6 +1031,37 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
       FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
     }
   }
+
+  addScavengingSpillSlot(MF, RS);
+}
+
+void
+PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
+                                         RegScavenger *RS) const {
+  // Reserve a slot closest to SP or frame pointer if we have a dynalloc or
+  // a large stack, which will require scavenging a register to materialize a
+  // large offset.
+
+  // We need to have a scavenger spill slot for spills if the frame size is
+  // large. In case there is no free register for large-offset addressing,
+  // this slot is used for the necessary emergency spill. Also, we need the
+  // slot for dynamic stack allocations.
+
+  // The scavenger might be invoked if the frame offset does not fit into
+  // the 16-bit immediate. We don't know the complete frame size here
+  // because we've not yet computed callee-saved register spills or the
+  // needed alignment padding.
+  unsigned StackSize = determineFrameLayout(MF, false, true);
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  if (MFI->hasVarSizedObjects() || spillsCR(MF) || hasNonRISpills(MF) ||
+      (hasSpills(MF) && !isInt<16>(StackSize))) {
+    const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+    const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+    const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC;
+    RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                       RC->getAlignment(),
+                                                       false));
+  }
 }
 
 bool 
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index d09e47f..53ee326 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -32,7 +32,9 @@ public:
       Subtarget(sti) {
   }
 
-  void determineFrameLayout(MachineFunction &MF) const;
+  unsigned determineFrameLayout(MachineFunction &MF,
+                                bool UpdateMF = true,
+                                bool UseEstimate = false) const;
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
@@ -44,7 +46,9 @@ public:
 
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                             RegScavenger *RS = NULL) const;
-  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                       RegScavenger *RS = NULL) const;
+  void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const;
 
   bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI,
@@ -144,6 +148,9 @@ public:
       return 0;
     }
 
+    // Note that the offsets here overlap, but this is fixed up in
+    // processFunctionBeforeFrameFinalized.
+
     static const SpillSlot Offsets[] = {
       // Floating-point register save area offsets.
       {PPC::F31, -8},
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index cf1f459..741e25e 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -57,6 +57,9 @@ cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
 static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
 cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
 
+static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
+cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
+
 static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
   if (TM.getSubtargetImpl()->isDarwin())
     return new TargetLoweringObjectFileMachO();
@@ -1180,13 +1183,15 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
 
   SDValue Ptr;
   EVT VT;
+  unsigned Alignment;
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
     Ptr = LD->getBasePtr();
     VT = LD->getMemoryVT();
-
+    Alignment = LD->getAlignment();
   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
     Ptr = ST->getBasePtr();
     VT  = ST->getMemoryVT();
+    Alignment = ST->getAlignment();
   } else
     return false;
 
@@ -1205,6 +1210,10 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
     if (!SelectAddressRegImm(Ptr, Offset, Base, DAG))
       return false;
   } else {
+    // LDU/STU need an address with at least 4-byte alignment.
+    if (Alignment < 4)
+      return false;
+
     // reg + imm * 4.
     if (!SelectAddressRegImmShift(Ptr, Offset, Base, DAG))
       return false;
@@ -4786,12 +4795,13 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   MachineFunction &MF = DAG.getMachineFunction();
   EVT VT = Op.getValueType();
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
-  std::vector<EVT> NodeTys;
   SDValue MFFSreg, InFlag;
 
   // Save FP Control Word to register
-  NodeTys.push_back(MVT::f64);    // return register
-  NodeTys.push_back(MVT::Glue);   // unused in this context
+  EVT NodeTys[] = {
+    MVT::f64,    // return register
+    MVT::Glue    // unused in this context
+  };
   SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, &InFlag, 0);
 
   // Save FP register to stack slot
@@ -5408,9 +5418,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     Op.getOperand(3),  // RHS
     DAG.getConstant(CompareOpc, MVT::i32)
   };
-  std::vector<EVT> VTs;
-  VTs.push_back(Op.getOperand(2).getValueType());
-  VTs.push_back(MVT::Glue);
+  EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
   SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);
 
   // Now that we have the comparison, emit a copy from the CR to a GPR.
@@ -6466,14 +6474,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
 
       // Create the PPCISD altivec 'dot' comparison node.
-      std::vector<EVT> VTs;
       SDValue Ops[] = {
         LHS.getOperand(2),  // LHS of compare
         LHS.getOperand(3),  // RHS of compare
         DAG.getConstant(CompareOpc, MVT::i32)
       };
-      VTs.push_back(LHS.getOperand(2).getValueType());
-      VTs.push_back(MVT::Glue);
+      EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
       SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);
 
       // Unpack the result based on how the target uses it.
@@ -6854,6 +6860,32 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
   }
 }
 
+bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                      bool *Fast) const {
+  if (DisablePPCUnaligned)
+    return false;
+
+  // PowerPC supports unaligned memory access for simple non-vector types.
+  // Although accessing unaligned addresses is not as efficient as accessing
+  // aligned addresses, it is generally more efficient than manual expansion,
+  // and generally only traps for software emulation when crossing page
+  // boundaries.
+
+  if (!VT.isSimple())
+    return false;
+
+  if (VT.getSimpleVT().isVector())
+    return false;
+
+  if (VT == MVT::ppcf128)
+    return false;
+
+  if (Fast)
+    *Fast = true;
+
+  return true;
+}
+
 /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
 /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
 /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index f5d418c..8d44d9f 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -329,7 +329,7 @@ namespace llvm {
     /// DAG node.
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
-    virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
 
     /// getSetCCResultType - Return the ISD::SETCC ValueType
     virtual EVT getSetCCResultType(EVT VT) const;
@@ -449,6 +449,10 @@ namespace llvm {
                         bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
                         MachineFunction &MF) const;
 
+    /// Is unaligned memory access allowed for the given type, and is it fast
+    /// relative to software emulation.
+    virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast = 0) const;
+
     /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
     /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
     /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 0120130..bca1bd5 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -555,7 +555,8 @@ def LHA8: DForm_1<42, (outs G8RC:$rD), (ins memri:$src),
                   PPC970_DGroup_Cracked;
 def LWA  : DSForm_1<58, 2, (outs G8RC:$rD), (ins memrix:$src),
                     "lwa $rD, $src", LdStLWA,
-                    [(set G8RC:$rD, (sextloadi32 ixaddr:$src))]>, isPPC64,
+                    [(set G8RC:$rD,
+                          (aligned4sextloadi32 ixaddr:$src))]>, isPPC64,
                     PPC970_DGroup_Cracked;
 def LHAX8: XForm_1<31, 343, (outs G8RC:$rD), (ins memrr:$src),
                    "lhax $rD, $src", LdStLHA,
@@ -648,7 +649,7 @@ def LWZUX8 : XForm_1<31, 55, (outs G8RC:$rD, ptr_rc:$ea_result),
 let canFoldAsLoad = 1, PPC970_Unit = 2 in {
 def LD   : DSForm_1<58, 0, (outs G8RC:$rD), (ins memrix:$src),
                     "ld $rD, $src", LdStLD,
-                    [(set G8RC:$rD, (load ixaddr:$src))]>, isPPC64;
+                    [(set G8RC:$rD, (aligned4load ixaddr:$src))]>, isPPC64;
 def LDrs : DSForm_1<58, 0, (outs G8RC:$rD), (ins memrs:$src),
                     "ld $rD, $src", LdStLD,
                     []>, isPPC64;
@@ -682,6 +683,10 @@ def LDtoc_restore : DSForm_1a<58, 0, (outs), (ins),
 def LDX  : XForm_1<31,  21, (outs G8RC:$rD), (ins memrr:$src),
                    "ldx $rD, $src", LdStLD,
                    [(set G8RC:$rD, (load xaddr:$src))]>, isPPC64;
+let isCodeGenOnly = 1 in
+def LDXu  : XForm_1<31,  21, (outs G8RC:$rD), (ins memrr:$src),
+                    "ldx $rD, $src", LdStLD,
+                    [(set G8RC:$rD, (load xaddr:$src))]>, isPPC64;
                    
 let mayLoad = 1 in
 def LDU  : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrix:$addr),
@@ -798,7 +803,7 @@ def STWX8 : XForm_8<31, 151, (outs), (ins G8RC:$rS, memrr:$dst),
 // Normal 8-byte stores.
 def STD  : DSForm_1<62, 0, (outs), (ins G8RC:$rS, memrix:$dst),
                     "std $rS, $dst", LdStSTD,
-                    [(store G8RC:$rS, ixaddr:$dst)]>, isPPC64;
+                    [(aligned4store G8RC:$rS, ixaddr:$dst)]>, isPPC64;
 def STDX  : XForm_8<31, 149, (outs), (ins G8RC:$rS, memrr:$dst),
                    "stdx $rS, $dst", LdStSTD,
                    [(store G8RC:$rS, xaddr:$dst)]>, isPPC64,
@@ -833,8 +838,9 @@ def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
 def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
                                         s16immX4:$ptroff, ptr_rc:$ptrreg),
                     "stdu $rS, $ptroff($ptrreg)", LdStSTDU,
-                    [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg, 
-                                                     iaddroff:$ptroff))]>,
+                    [(set ptr_rc:$ea_res,
+                       (aligned4pre_store G8RC:$rS, ptr_rc:$ptrreg, 
+                                          iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">,
                     isPPC64;
 
@@ -979,3 +985,13 @@ def : Pat<(add G8RC:$in, (PPChi tjumptable:$g, 0)),
           (ADDIS8 G8RC:$in, tjumptable:$g)>;
 def : Pat<(add G8RC:$in, (PPChi tblockaddress:$g, 0)),
           (ADDIS8 G8RC:$in, tblockaddress:$g)>;
+
+// Patterns to match r+r indexed loads and stores for
+// addresses without at least 4-byte alignment.
+def : Pat<(i64 (unaligned4sextloadi32 xoaddr:$src)),
+          (LWAX xoaddr:$src)>;
+def : Pat<(i64 (unaligned4load xoaddr:$src)),
+          (LDX xoaddr:$src)>;
+def : Pat<(unaligned4store G8RC:$rS, xoaddr:$dst),
+          (STDX G8RC:$rS, xoaddr:$dst)>;
+
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 0cf28ae..0ed7ff2 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -182,6 +182,9 @@ class VX2_Int<bits<11> xo, string opc, Intrinsic IntID>
 //===----------------------------------------------------------------------===//
 // Instruction Definitions.
 
+def HasAltivec : Predicate<"PPCSubTarget.hasAltivec()">;
+let Predicates = [HasAltivec] in {
+
 def DSS      : DSS_Form<822, (outs),
                         (ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2),
                         "dss $STRM", LdStLoad /*FIXME*/, []>;
@@ -733,3 +736,6 @@ def : Pat<(v4f32 (ftrunc (v4f32 VRRC:$vA))),
           (VRFIZ VRRC:$vA)>;
 def : Pat<(v4f32 (fnearbyint (v4f32 VRRC:$vA))),
           (VRFIN VRRC:$vA)>;
+
+} // end HasAltivec
+
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index a0517a8..7fe7880 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -33,11 +33,6 @@
 #define GET_INSTRINFO_CTOR
 #include "PPCGenInstrInfo.inc"
 
-namespace llvm {
-extern cl::opt<bool> DisablePPC32RS;
-extern cl::opt<bool> DisablePPC64RS;
-}
-
 using namespace llvm;
 
 static cl::
@@ -444,7 +439,8 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
                                   unsigned SrcReg, bool isKill,
                                   int FrameIdx,
                                   const TargetRegisterClass *RC,
-                                  SmallVectorImpl<MachineInstr*> &NewMIs) const{
+                                  SmallVectorImpl<MachineInstr*> &NewMIs,
+                                  bool &NonRI) const{
   DebugLoc DL;
   if (PPC::GPRCRegClass.hasSubClassEq(RC)) {
     if (SrcReg != PPC::LR) {
@@ -489,47 +485,11 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
                                                getKillRegState(isKill)),
                                        FrameIdx));
   } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
-    if ((!DisablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) ||
-        (!DisablePPC64RS && TM.getSubtargetImpl()->isPPC64())) {
-      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR))
-                                         .addReg(SrcReg,
-                                                 getKillRegState(isKill)),
-                                         FrameIdx));
-      return true;
-    } else {
-      // FIXME: We need a scatch reg here.  The trouble with using R0 is that
-      // it's possible for the stack frame to be so big the save location is
-      // out of range of immediate offsets, necessitating another register.
-      // We hack this on Darwin by reserving R2.  It's probably broken on Linux
-      // at the moment.
-
-      bool is64Bit = TM.getSubtargetImpl()->isPPC64();
-      // We need to store the CR in the low 4-bits of the saved value.  First,
-      // issue a MFCR to save all of the CRBits.
-      unsigned ScratchReg = TM.getSubtargetImpl()->isDarwinABI() ?
-                              (is64Bit ? PPC::X2 : PPC::R2) :
-                              (is64Bit ? PPC::X0 : PPC::R0);
-      NewMIs.push_back(BuildMI(MF, DL, get(is64Bit ? PPC::MFCR8pseud :
-                                             PPC::MFCRpseud), ScratchReg)
-                               .addReg(SrcReg, getKillRegState(isKill)));
-
-      // If the saved register wasn't CR0, shift the bits left so that they are
-      // in CR0's slot.
-      if (SrcReg != PPC::CR0) {
-        unsigned ShiftBits = getPPCRegisterNumbering(SrcReg)*4;
-        // rlwinm scratch, scratch, ShiftBits, 0, 31.
-        NewMIs.push_back(BuildMI(MF, DL, get(is64Bit ? PPC::RLWINM8 :
-                           PPC::RLWINM), ScratchReg)
-                       .addReg(ScratchReg).addImm(ShiftBits)
-                       .addImm(0).addImm(31));
-      }
-
-      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(is64Bit ?
-                                           PPC::STW8 : PPC::STW))
-                                         .addReg(ScratchReg,
-                                                 getKillRegState(isKill)),
-                                         FrameIdx));
-    }
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    return true;
   } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
     // FIXME: We use CRi here because there is no mtcrf on a bit. Since the
     // backend currently only uses CR1EQ as an individual bit, this should
@@ -562,23 +522,14 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
       Reg = PPC::CR7;
 
     return StoreRegToStackSlot(MF, Reg, isKill, FrameIdx,
-                               &PPC::CRRCRegClass, NewMIs);
+                               &PPC::CRRCRegClass, NewMIs, NonRI);
 
   } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
-    // We don't have indexed addressing for vector loads.  Emit:
-    // R0 = ADDI FI#
-    // STVX VAL, 0, R0
-    //
-    // FIXME: We use R0 here, because it isn't available for RA.
-    bool Is64Bit = TM.getSubtargetImpl()->isPPC64();
-    unsigned Instr = Is64Bit ? PPC::ADDI8 : PPC::ADDI;
-    unsigned GPR0  = Is64Bit ? PPC::X0    : PPC::R0;
-    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Instr), GPR0),
-                                       FrameIdx, 0, 0));
-    NewMIs.push_back(BuildMI(MF, DL, get(PPC::STVX))
-                     .addReg(SrcReg, getKillRegState(isKill))
-                     .addReg(GPR0)
-                     .addReg(GPR0));
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STVX))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
   } else {
     llvm_unreachable("Unknown regclass!");
   }
@@ -595,10 +546,15 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   MachineFunction &MF = *MBB.getParent();
   SmallVector<MachineInstr*, 4> NewMIs;
 
-  if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs)) {
-    PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasSpills();
+
+  bool NonRI = false;
+  if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs, NonRI))
     FuncInfo->setSpillsCR();
-  }
+
+  if (NonRI)
+    FuncInfo->setHasNonRISpills();
 
   for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
     MBB.insert(MI, NewMIs[i]);
@@ -616,7 +572,8 @@ bool
 PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
                                    unsigned DestReg, int FrameIdx,
                                    const TargetRegisterClass *RC,
-                                   SmallVectorImpl<MachineInstr*> &NewMIs)const{
+                                   SmallVectorImpl<MachineInstr*> &NewMIs,
+                                   bool &NonRI) const{
   if (PPC::GPRCRegClass.hasSubClassEq(RC)) {
     if (DestReg != PPC::LR) {
       NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
@@ -642,37 +599,10 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
     NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg),
                                        FrameIdx));
   } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
-    if ((!DisablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) ||
-        (!DisablePPC64RS && TM.getSubtargetImpl()->isPPC64())) {
-      NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
-                                                 get(PPC::RESTORE_CR), DestReg)
-                                         , FrameIdx));
-      return true;
-    } else {
-      // FIXME: We need a scatch reg here.  The trouble with using R0 is that
-      // it's possible for the stack frame to be so big the save location is
-      // out of range of immediate offsets, necessitating another register.
-      // We hack this on Darwin by reserving R2.  It's probably broken on Linux
-      // at the moment.
-      unsigned ScratchReg = TM.getSubtargetImpl()->isDarwinABI() ?
-                                                            PPC::R2 : PPC::R0;
-      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
-                                         ScratchReg), FrameIdx));
-  
-      // If the reloaded register isn't CR0, shift the bits right so that they are
-      // in the right CR's slot.
-      if (DestReg != PPC::CR0) {
-        unsigned ShiftBits = getPPCRegisterNumbering(DestReg)*4;
-        // rlwinm r11, r11, 32-ShiftBits, 0, 31.
-        NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), ScratchReg)
-                      .addReg(ScratchReg).addImm(32-ShiftBits).addImm(0)
-                      .addImm(31));
-      }
-  
-      NewMIs.push_back(BuildMI(MF, DL, get(TM.getSubtargetImpl()->isPPC64() ?
-                         PPC::MTCRF8 : PPC::MTCRF), DestReg)
-                       .addReg(ScratchReg));
-    }
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
+                                               get(PPC::RESTORE_CR), DestReg),
+                                       FrameIdx));
+    return true;
   } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
 
     unsigned Reg = 0;
@@ -702,21 +632,12 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
       Reg = PPC::CR7;
 
     return LoadRegFromStackSlot(MF, DL, Reg, FrameIdx,
-                                &PPC::CRRCRegClass, NewMIs);
+                                &PPC::CRRCRegClass, NewMIs, NonRI);
 
   } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
-    // We don't have indexed addressing for vector loads.  Emit:
-    // R0 = ADDI FI#
-    // Dest = LVX 0, R0
-    //
-    // FIXME: We use R0 here, because it isn't available for RA.
-    bool Is64Bit = TM.getSubtargetImpl()->isPPC64();
-    unsigned Instr = Is64Bit ? PPC::ADDI8 : PPC::ADDI;
-    unsigned GPR0  = Is64Bit ? PPC::X0    : PPC::R0;
-    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Instr), GPR0),
-                                       FrameIdx, 0, 0));
-    NewMIs.push_back(BuildMI(MF, DL, get(PPC::LVX),DestReg).addReg(GPR0)
-                     .addReg(GPR0));
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LVX), DestReg),
+                                       FrameIdx));
+    NonRI = true;
   } else {
     llvm_unreachable("Unknown regclass!");
   }
@@ -734,10 +655,17 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   SmallVector<MachineInstr*, 4> NewMIs;
   DebugLoc DL;
   if (MI != MBB.end()) DL = MI->getDebugLoc();
-  if (LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs)) {
-    PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasSpills();
+
+  bool NonRI = false;
+  if (LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs, NonRI))
     FuncInfo->setSpillsCR();
-  }
+
+  if (NonRI)
+    FuncInfo->setHasNonRISpills();
+
   for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
     MBB.insert(MI, NewMIs[i]);
 
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 374213e..5d4ae91 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -71,11 +71,13 @@ class PPCInstrInfo : public PPCGenInstrInfo {
   bool StoreRegToStackSlot(MachineFunction &MF,
                            unsigned SrcReg, bool isKill, int FrameIdx,
                            const TargetRegisterClass *RC,
-                           SmallVectorImpl<MachineInstr*> &NewMIs) const;
+                           SmallVectorImpl<MachineInstr*> &NewMIs,
+                           bool &NonRI) const;
   bool LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
                             unsigned DestReg, int FrameIdx,
                             const TargetRegisterClass *RC,
-                            SmallVectorImpl<MachineInstr*> &NewMIs) const;
+                            SmallVectorImpl<MachineInstr*> &NewMIs,
+                            bool &NonRI) const;
 public:
   explicit PPCInstrInfo(PPCTargetMachine &TM);
 
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 460e943..3f181aa 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -278,6 +278,38 @@ def imm16ShiftedSExt : PatLeaf<(imm), [{
   return N->getZExtValue() == (uint64_t)(int)N->getZExtValue();
 }], HI16>;
 
+// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
+// restricted memrix (offset/4) constants are alignment sensitive. If these
+// offsets are hidden behind TOC entries than the values of the lower-order
+// bits cannot be checked directly. As a result, we need to also incorporate
+// an alignment check into the relevant patterns.
+
+def aligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4store : PatFrag<(ops node:$val, node:$ptr),
+                            (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4pre_store : PatFrag<
+                          (ops node:$val, node:$base, node:$offset),
+                          (pre_store node:$val, node:$base, node:$offset), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def unaligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+def unaligned4store : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() < 4;
+}]>;
+def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
 
 //===----------------------------------------------------------------------===//
 // PowerPC Flag Definitions.
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 045b375..b1636a2 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -37,6 +37,13 @@ class PPCFunctionInfo : public MachineFunctionInfo {
   /// PEI.
   bool MustSaveLR;
 
+  /// Does this function have any stack spills.
+  bool HasSpills;
+
+  /// Does this function spill using instructions with only r+r (not r+i)
+  /// forms.
+  bool HasNonRISpills;
+
   /// SpillsCR - Indicates whether CR is spilled in the current function.
   bool SpillsCR;
 
@@ -78,6 +85,8 @@ public:
   explicit PPCFunctionInfo(MachineFunction &MF) 
     : FramePointerSaveIndex(0),
       ReturnAddrSaveIndex(0),
+      HasSpills(false),
+      HasNonRISpills(false),
       SpillsCR(false),
       LRStoreRequired(false),
       MinReservedArea(0),
@@ -109,6 +118,12 @@ public:
   void setMustSaveLR(bool U) { MustSaveLR = U; }
   bool mustSaveLR() const    { return MustSaveLR; }
 
+  void setHasSpills()      { HasSpills = true; }
+  bool hasSpills() const   { return HasSpills; }
+
+  void setHasNonRISpills()    { HasNonRISpills = true; }
+  bool hasNonRISpills() const { return HasNonRISpills; }
+
   void setSpillsCR()       { SpillsCR = true; }
   bool isCRSpilled() const { return SpillsCR; }
 
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index df245cc..e2c7221 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -46,26 +46,8 @@
 #define GET_REGINFO_TARGET_DESC
 #include "PPCGenRegisterInfo.inc"
 
-namespace llvm {
-cl::opt<bool> DisablePPC32RS("disable-ppc32-regscavenger",
-                                   cl::init(false),
-                                   cl::desc("Disable PPC32 register scavenger"),
-                                   cl::Hidden);
-cl::opt<bool> DisablePPC64RS("disable-ppc64-regscavenger",
-                                   cl::init(false),
-                                   cl::desc("Disable PPC64 register scavenger"),
-                                   cl::Hidden);
-}
-
 using namespace llvm;
 
-// FIXME (64-bit): Should be inlined.
-bool
-PPCRegisterInfo::requiresRegisterScavenging(const MachineFunction &) const {
-  return ((!DisablePPC32RS && !Subtarget.isPPC64()) ||
-          (!DisablePPC64RS && Subtarget.isPPC64()));
-}
-
 PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
                                  const TargetInstrInfo &tii)
   : PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR,
@@ -89,12 +71,6 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
   ImmToIdxMap[PPC::ADDI8] = PPC::ADD8; ImmToIdxMap[PPC::STD_32] = PPC::STDX_32;
 }
 
-bool
-PPCRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
-  return requiresRegisterScavenging(MF);
-}
-
-
 /// getPointerRegClass - Return the register class to use to hold pointers.
 /// This is used for addressing modes.
 const TargetRegisterClass *
@@ -139,12 +115,6 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     Reserved.set(PPC::R2);  // System-reserved register
     Reserved.set(PPC::R13); // Small Data Area pointer register
   }
-  // Reserve R2 on Darwin to hack around the problem of save/restore of CR
-  // when the stack frame is too big to address directly; we need two regs.
-  // This is a hack.
-  if (Subtarget.isDarwinABI()) {
-    Reserved.set(PPC::R2);
-  }
   
   // On PPC64, r13 is the thread pointer. Never allocate this register.
   // Note that this is over conservative, as it also prevents allocation of R31
@@ -162,12 +132,6 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     if (Subtarget.isSVR4ABI()) {
       Reserved.set(PPC::X2);
     }
-    // Reserve X2 on Darwin to hack around the problem of save/restore of CR
-    // when the stack frame is too big to address directly; we need two regs.
-    // This is a hack.
-    if (Subtarget.isDarwinABI()) {
-      Reserved.set(PPC::X2);
-    }
   }
 
   if (PPCFI->needsFP(MF))
@@ -199,38 +163,10 @@ PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
 }
 
-bool
-PPCRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const {
-  switch (RC->getID()) {
-  case PPC::G8RCRegClassID:
-  case PPC::GPRCRegClassID:
-  case PPC::F8RCRegClassID:
-  case PPC::F4RCRegClassID:
-  case PPC::VRRCRegClassID:
-    return true;
-  default:
-    return false;
-  }
-}
-
 //===----------------------------------------------------------------------===//
 // Stack Frame Processing methods
 //===----------------------------------------------------------------------===//
 
-/// findScratchRegister - Find a 'free' PPC register. Try for a call-clobbered
-/// register first and then a spilled callee-saved register if that fails.
-static
-unsigned findScratchRegister(MachineBasicBlock::iterator II, RegScavenger *RS,
-                             const TargetRegisterClass *RC, int SPAdj) {
-  assert(RS && "Register scavenging must be on");
-  unsigned Reg = RS->FindUnusedReg(RC);
-  // FIXME: move ARM callee-saved reg scan to target independent code, then 
-  // search for already spilled CS register here.
-  if (Reg == 0)
-    Reg = RS->scavengeRegister(RC, II, SPAdj);
-  return Reg;
-}
-
 /// lowerDynamicAlloc - Generate the code for allocating an object in the
 /// current frame.  The sequence of code with be in the general form
 ///
@@ -271,28 +207,16 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II,
   // Fortunately, a frame greater than 32K is rare.
   const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
   const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
-  const TargetRegisterClass *RC = LP64 ? G8RC : GPRC;
-
-  // FIXME (64-bit): Use "findScratchRegister"
-  unsigned Reg;
-  if (requiresRegisterScavenging(MF))
-    Reg = findScratchRegister(II, RS, RC, SPAdj);
-  else
-    Reg = PPC::R0;
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
   
   if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
     BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
       .addReg(PPC::R31)
       .addImm(FrameSize);
   } else if (LP64) {
-    if (requiresRegisterScavenging(MF)) // FIXME (64-bit): Use "true" part.
-      BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
-        .addImm(0)
-        .addReg(PPC::X1);
-    else
-      BuildMI(MBB, II, dl, TII.get(PPC::LD), PPC::X0)
-        .addImm(0)
-        .addReg(PPC::X1);
+    BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
+      .addImm(0)
+      .addReg(PPC::X1);
   } else {
     BuildMI(MBB, II, dl, TII.get(PPC::LWZ), Reg)
       .addImm(0)
@@ -302,17 +226,10 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II,
   // Grow the stack and update the stack pointer link, then determine the
   // address of new allocated space.
   if (LP64) {
-    if (requiresRegisterScavenging(MF)) // FIXME (64-bit): Use "true" part.
-      BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
-        .addReg(Reg, RegState::Kill)
-        .addReg(PPC::X1)
-        .addReg(MI.getOperand(1).getReg());
-    else
-      BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
-        .addReg(PPC::X0, RegState::Kill)
-        .addReg(PPC::X1)
-        .addReg(MI.getOperand(1).getReg());
-
+    BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
+      .addReg(Reg, RegState::Kill)
+      .addReg(PPC::X1)
+      .addReg(MI.getOperand(1).getReg());
     if (!MI.getOperand(1).isKill())
       BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
         .addReg(PPC::X1)
@@ -369,8 +286,7 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
   (void) RS;
 
   bool LP64 = Subtarget.isPPC64();
-  unsigned Reg = Subtarget.isDarwinABI() ?  (LP64 ? PPC::X2 : PPC::R2) :
-                                            (LP64 ? PPC::X0 : PPC::R0);
+  unsigned Reg = LP64 ? PPC::X0 : PPC::R0;
   unsigned SrcReg = MI.getOperand(0).getReg();
 
   // We need to store the CR in the low 4-bits of the saved value. First, issue
@@ -412,8 +328,7 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
   (void) RS;
 
   bool LP64 = Subtarget.isPPC64();
-  unsigned Reg = Subtarget.isDarwinABI() ?  (LP64 ? PPC::X2 : PPC::R2) :
-                                            (LP64 ? PPC::X0 : PPC::R0);
+  unsigned Reg = LP64 ? PPC::X0 : PPC::R0;
   unsigned DestReg = MI.getOperand(0).getReg();
   assert(MI.definesRegister(DestReg) &&
     "RESTORE_CR does not define its destination");
@@ -499,14 +414,12 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   }
 
   // Special case for pseudo-ops SPILL_CR and RESTORE_CR.
-  if (requiresRegisterScavenging(MF)) {
-    if (OpC == PPC::SPILL_CR) {
-      lowerCRSpilling(II, FrameIndex, SPAdj, RS);
-      return;
-    } else if (OpC == PPC::RESTORE_CR) {
-      lowerCRRestore(II, FrameIndex, SPAdj, RS);
-      return;
-    }
+  if (OpC == PPC::SPILL_CR) {
+    lowerCRSpilling(II, FrameIndex, SPAdj, RS);
+    return;
+  } else if (OpC == PPC::RESTORE_CR) {
+    lowerCRRestore(II, FrameIndex, SPAdj, RS);
+    return;
   }
 
   // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
@@ -529,7 +442,25 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     isIXAddr = true;
     break;
   }
-  
+
+  bool noImmForm = false;
+  switch (OpC) {
+  case PPC::LVEBX:
+  case PPC::LVEHX:
+  case PPC::LVEWX:
+  case PPC::LVX:
+  case PPC::LVXL:
+  case PPC::LVSL:
+  case PPC::LVSR:
+  case PPC::STVEBX:
+  case PPC::STVEHX:
+  case PPC::STVEWX:
+  case PPC::STVX:
+  case PPC::STVXL:
+    noImmForm = true;
+    break;
+  }
+
   // Now add the frame object offset to the offset from r1.
   int Offset = MFI->getObjectOffset(FrameIndex);
   if (!isIXAddr)
@@ -553,7 +484,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // only "std" to a stack slot that is at least 4-byte aligned, but it can
   // happen in invalid code.
   if (OpC == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
-      (isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0))) {
+      (!noImmForm &&
+       isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0))) {
     if (isIXAddr)
       Offset >>= 2;    // The actual encoded value has the low two bits zero.
     MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
@@ -563,13 +495,9 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // The offset doesn't fit into a single register, scavenge one to build the
   // offset in.
 
-  unsigned SReg;
-  if (requiresRegisterScavenging(MF)) {
-    const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
-    const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
-    SReg = findScratchRegister(II, RS, is64Bit ? G8RC : GPRC, SPAdj);
-  } else
-    SReg = is64Bit ? PPC::X0 : PPC::R0;
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  unsigned SReg = MF.getRegInfo().createVirtualRegister(is64Bit ? G8RC : GPRC);
 
   // Insert a set of rA with the full offset value before the ld, st, or add
   BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SReg)
@@ -584,7 +512,9 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   //   addi 0:rA 1:rB, 2, imm ==> add 0:rA, 1:rB, 2:r0
   unsigned OperandBase;
 
-  if (OpC != TargetOpcode::INLINEASM) {
+  if (noImmForm)
+    OperandBase = 1;
+  else if (OpC != TargetOpcode::INLINEASM) {
     assert(ImmToIdxMap.count(OpC) &&
            "No indexed form of load or store available!");
     unsigned NewOpcode = ImmToIdxMap.find(OpC)->second;
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index 9840666..5f89f63 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -47,13 +47,18 @@ public:
 
   BitVector getReservedRegs(const MachineFunction &MF) const;
 
-  virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const;
-
-  /// requiresRegisterScavenging - We require a register scavenger.
-  /// FIXME (64-bit): Should be inlined.
-  bool requiresRegisterScavenging(const MachineFunction &MF) const;
-
-  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+  /// We require the register scavenger.
+  bool requiresRegisterScavenging(const MachineFunction &MF) const {
+    return true;
+  }
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const {
+    return true;
+  }
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+    return true;
+  }
 
   void lowerDynamicAlloc(MachineBasicBlock::iterator II,
                          int SPAdj, RegScavenger *RS) const;
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index ba87918..e099a9f 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -23,11 +23,9 @@ class AMDGPUTargetMachine;
 // R600 Passes
 FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
 FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
-FunctionPass *createR600LowerConstCopy(TargetMachine &tm);
 
 // SI Passes
 FunctionPass *createSIAnnotateControlFlowPass();
-FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIInsertWaits(TargetMachine &tm);
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 40f4741..1a26c77 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -38,3 +38,4 @@ include "AMDGPUInstrInfo.td"
 include "AMDGPUIntrinsics.td"
 include "AMDGPURegisterInfo.td"
 include "AMDGPUInstructions.td"
+include "AMDGPUCallingConv.td"
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index c30dbe4..f600144 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -141,5 +141,5 @@ void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) {
   SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
   OutStreamer.EmitIntValue(MaxSGPR + 1, 4);
   OutStreamer.EmitIntValue(MaxVGPR + 1, 4);
-  OutStreamer.EmitIntValue(MFI->SPIPSInputAddr, 4);
+  OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
 }
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
new file mode 100644
index 0000000..45ae37e
--- /dev/null
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -0,0 +1,42 @@
+//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the AMD Radeon GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+// Inversion of CCIfInReg
+class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+
+// Calling convention for SI
+def CC_SI : CallingConv<[
+
+  CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+    SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15
+  ]>>>,
+
+  CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
+    [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
+    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR12, SGPR15 ]
+  >>>,
+
+  CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
+  ]>>>
+
+]>;
+
+def CC_AMDGPU : CallingConv<[
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().device()"#
+       "->getGeneration() == AMDGPUDeviceInfo::HD7XXX", CCDelegateTo<CC_SI>>
+]>;
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 0a33264..5995b6f 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -14,7 +14,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUISelLowering.h"
+#include "AMDGPURegisterInfo.h"
 #include "AMDILIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -22,6 +25,8 @@
 
 using namespace llvm;
 
+#include "AMDGPUGenCallingConv.inc"
+
 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   TargetLowering(TM, new TargetLoweringObjectFileELF()) {
 
@@ -64,17 +69,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
 // TargetLowering Callbacks
 //===---------------------------------------------------------------------===//
 
-SDValue AMDGPUTargetLowering::LowerFormalArguments(
-                                      SDValue Chain,
-                                      CallingConv::ID CallConv,
-                                      bool isVarArg,
-                                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      DebugLoc DL, SelectionDAG &DAG,
-                                      SmallVectorImpl<SDValue> &InVals) const {
-  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
-    InVals.push_back(SDValue());
-  }
-  return Chain;
+void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
+                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
+
+  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
 }
 
 SDValue AMDGPUTargetLowering::LowerReturn(
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 9e7d997..f31b646 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -39,15 +39,12 @@ protected:
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
 
+  void AnalyzeFormalArguments(CCState &State,
+                              const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
 public:
   AMDGPUTargetLowering(TargetMachine &TM);
 
-  virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
-                             bool isVarArg,
-                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                             DebugLoc DL, SelectionDAG &DAG,
-                             SmallVectorImpl<SDValue> &InVals) const;
-
   virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
index 15840b3..ed6c8ec 100644
--- a/lib/Target/R600/AMDGPUIndirectAddressing.cpp
+++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
@@ -289,7 +289,6 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
 
           // We only need to use REG_SEQUENCE for explicit defs, since the
           // register coalescer won't do anything with the implicit defs.
-          MachineInstr *DefInstr = MRI.getVRegDef(Reg);
           if (!regHasExplicitDef(MRI, Reg)) {
             continue;
           }
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 960f108..e740348 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -132,13 +132,6 @@ class FNEG <RegisterClass rc> : AMDGPUShaderInst <
   [(set rc:$dst, (fneg rc:$src0))]
 >;
 
-def SHADER_TYPE : AMDGPUShaderInst <
-  (outs),
-  (ins i32imm:$type),
-  "SHADER_TYPE $type",
-  [(int_AMDGPU_shader_type imm:$type)]
->;
-
 } // usesCustomInserter = 1
 
 multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
@@ -209,8 +202,8 @@ class Vector2_Build <ValueType vecType, RegisterClass vectorClass,
   (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1)
 >;
 
-class Vector_Build <ValueType vecType, RegisterClass vectorClass,
-                    ValueType elemType, RegisterClass elemClass> : Pat <
+class Vector4_Build <ValueType vecType, RegisterClass vectorClass,
+                     ValueType elemType, RegisterClass elemClass> : Pat <
   (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y),
                          (elemType elemClass:$z), (elemType elemClass:$w))),
   (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
index 2ba2d4b..eecb25b 100644
--- a/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/lib/Target/R600/AMDGPUIntrinsics.td
@@ -50,8 +50,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
   def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-
-  def int_AMDGPU_shader_type : Intrinsic<[], [llvm_i32_ty], []>;
 }
 
 let TargetPrefix = "TGSI", isTarget = 1 in {
diff --git a/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/lib/Target/R600/AMDGPUStructurizeCFG.cpp
index 26f842e..b723433 100644
--- a/lib/Target/R600/AMDGPUStructurizeCFG.cpp
+++ b/lib/Target/R600/AMDGPUStructurizeCFG.cpp
@@ -243,6 +243,7 @@ public:
     initializeRegionInfoPass(*PassRegistry::getPassRegistry());
   }
 
+  using Pass::doInitialization;
   virtual bool doInitialization(Region *R, RGPassManager &RGM);
 
   virtual bool runOnRegion(Region *R, RGPassManager &RGM);
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index e2f00be..0185747 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -17,6 +17,7 @@
 #include "AMDGPU.h"
 #include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
+#include "R600MachineScheduler.h"
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
 #include "llvm/Analysis/Passes.h"
@@ -39,6 +40,14 @@ extern "C" void LLVMInitializeR600Target() {
   RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget);
 }
 
+static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
+  return new ScheduleDAGMI(C, new R600SchedStrategy());
+}
+
+static MachineSchedRegistry
+SchedCustomRegistry("r600", "Run R600's custom scheduler",
+                    createR600MachineScheduler);
+
 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
     StringRef CPU, StringRef FS,
   TargetOptions Options,
@@ -70,7 +79,13 @@ namespace {
 class AMDGPUPassConfig : public TargetPassConfig {
 public:
   AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {}
+    : TargetPassConfig(TM, PM) {
+    const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+    if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+      enablePass(&MachineSchedulerID);
+      MachineSchedRegistry::setDefault(createR600MachineScheduler);
+    }
+  }
 
   AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
     return getTM<AMDGPUTargetMachine>();
@@ -112,11 +127,6 @@ bool AMDGPUPassConfig::addInstSelector() {
 }
 
 bool AMDGPUPassConfig::addPreRegAlloc() {
-  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-
-  if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
-    addPass(createSIAssignInterpRegsPass(*TM));
-  }
   addPass(createAMDGPUConvertToISAPass(*TM));
   return false;
 }
@@ -143,7 +153,6 @@ bool AMDGPUPassConfig::addPreEmitPass() {
     addPass(createAMDGPUCFGStructurizerPass(*TM));
     addPass(createR600ExpandSpecialInstrsPass(*TM));
     addPass(&FinalizeMachineBundlesID);
-    addPass(createR600LowerConstCopy(*TM));
   } else {
     addPass(createSILowerControlFlowPass(*TM));
   }
diff --git a/lib/Target/R600/AMDIL.h b/lib/Target/R600/AMDIL.h
index b39fbdb..39ab664 100644
--- a/lib/Target/R600/AMDIL.h
+++ b/lib/Target/R600/AMDIL.h
@@ -96,24 +96,23 @@ enum AddressSpaces {
   ADDRESS_NONE     = 5, ///< Address space for unknown memory.
   PARAM_D_ADDRESS  = 6, ///< Address space for direct addressible parameter memory (CONST0)
   PARAM_I_ADDRESS  = 7, ///< Address space for indirect addressible parameter memory (VTX1)
-  USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI
-  CONSTANT_BUFFER_0 = 9,
-  CONSTANT_BUFFER_1 = 10,
-  CONSTANT_BUFFER_2 = 11,
-  CONSTANT_BUFFER_3 = 12,
-  CONSTANT_BUFFER_4 = 13,
-  CONSTANT_BUFFER_5 = 14,
-  CONSTANT_BUFFER_6 = 15,
-  CONSTANT_BUFFER_7 = 16,
-  CONSTANT_BUFFER_8 = 17,
-  CONSTANT_BUFFER_9 = 18,
-  CONSTANT_BUFFER_10 = 19,
-  CONSTANT_BUFFER_11 = 20,
-  CONSTANT_BUFFER_12 = 21,
-  CONSTANT_BUFFER_13 = 22,
-  CONSTANT_BUFFER_14 = 23,
-  CONSTANT_BUFFER_15 = 24,
-  LAST_ADDRESS     = 25
+  CONSTANT_BUFFER_0 = 8,
+  CONSTANT_BUFFER_1 = 9,
+  CONSTANT_BUFFER_2 = 10,
+  CONSTANT_BUFFER_3 = 11,
+  CONSTANT_BUFFER_4 = 12,
+  CONSTANT_BUFFER_5 = 13,
+  CONSTANT_BUFFER_6 = 14,
+  CONSTANT_BUFFER_7 = 15,
+  CONSTANT_BUFFER_8 = 16,
+  CONSTANT_BUFFER_9 = 17,
+  CONSTANT_BUFFER_10 = 18,
+  CONSTANT_BUFFER_11 = 19,
+  CONSTANT_BUFFER_12 = 20,
+  CONSTANT_BUFFER_13 = 21,
+  CONSTANT_BUFFER_14 = 22,
+  CONSTANT_BUFFER_15 = 23,
+  LAST_ADDRESS     = 24
 };
 
 } // namespace AMDGPUAS
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
index aa8ab6b..b0cd0f9 100644
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -2595,6 +2595,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
 
   static int getBranchNzeroOpcode(int oldOpcode) {
     switch(oldOpcode) {
+    case AMDGPU::JUMP_COND:
     case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
     case AMDGPU::BRANCH_COND_i32:
     case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
@@ -2606,6 +2607,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
 
   static int getBranchZeroOpcode(int oldOpcode) {
     switch(oldOpcode) {
+    case AMDGPU::JUMP_COND:
     case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
     case AMDGPU::BRANCH_COND_i32:
     case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
@@ -2617,6 +2619,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
 
   static int getContinueNzeroOpcode(int oldOpcode) {
     switch(oldOpcode) {
+    case AMDGPU::JUMP_COND:
     case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
     default:
       assert(0 && "internal error");
@@ -2626,6 +2629,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
 
   static int getContinueZeroOpcode(int oldOpcode) {
     switch(oldOpcode) {
+    case AMDGPU::JUMP_COND:
     case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
     default:
       assert(0 && "internal error");
@@ -2654,8 +2658,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
 
   static bool isCondBranch(MachineInstr *instr) {
     switch (instr->getOpcode()) {
-      case AMDGPU::JUMP:
-        return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() != 0;
+      case AMDGPU::JUMP_COND:
       case AMDGPU::BRANCH_COND_i32:
       case AMDGPU::BRANCH_COND_f32:
       break;
@@ -2668,7 +2671,6 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
   static bool isUncondBranch(MachineInstr *instr) {
     switch (instr->getOpcode()) {
     case AMDGPU::JUMP:
-      return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() == 0;
     case AMDGPU::BRANCH:
       return true;
     default:
diff --git a/lib/Target/R600/AMDILDevice.cpp b/lib/Target/R600/AMDILDevice.cpp
index eec5059..db8e01e 100644
--- a/lib/Target/R600/AMDILDevice.cpp
+++ b/lib/Target/R600/AMDILDevice.cpp
@@ -115,10 +115,18 @@ bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const {
 
 std::string
 AMDGPUDevice::getDataLayout() const {
-    return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
-      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
-      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
-      "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
-      "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
-      "-n8:16:32:64");
+  std::string DataLayout = std::string(
+   "e"
+   "-p:32:32:32"
+   "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
+   "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128"
+   "-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+   "-n32:64"
+  );
+
+  if (usesHardware(AMDGPUDeviceInfo::DoubleOps)) {
+    DataLayout.append("-f64:64:64");
+  }
+
+  return DataLayout;
 }
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
index e77b9dc..fa8f62d 100644
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -162,6 +162,35 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   }
   switch (Opc) {
   default: break;
+  case ISD::BUILD_VECTOR: {
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+      break;
+    }
+    // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+    // that adds a 128 bits reg copy when going through TwoAddressInstructions
+    // pass. We want to avoid 128 bits copies as much as possible because they
+    // can't be bundled by our scheduler.
+    SDValue RegSeqArgs[9] = {
+      CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
+      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
+      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
+      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
+    };
+    bool IsRegSeq = true;
+    for (unsigned i = 0; i < N->getNumOperands(); i++) {
+      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
+        IsRegSeq = false;
+        break;
+      }
+      RegSeqArgs[2 * i + 1] = N->getOperand(i);
+    }
+    if (!IsRegSeq)
+      break;
+    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
+        RegSeqArgs, 2 * N->getNumOperands() + 1);
+  }
   case ISD::ConstantFP:
   case ISD::Constant: {
     const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
@@ -336,17 +365,34 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
     SDValue Operand = Ops[OperandIdx[i] - 1];
     switch (Operand.getOpcode()) {
     case AMDGPUISD::CONST_ADDRESS: {
-      if (i == 2)
-        break;
       SDValue CstOffset;
-      if (!Operand.getValueType().isVector() &&
-          SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) {
-        Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
-        Ops[SelIdx[i] - 1] = CstOffset;
-        return true;
+      if (Operand.getValueType().isVector() ||
+          !SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset))
+        break;
+
+      // Gather others constants values
+      std::vector<unsigned> Consts;
+      for (unsigned j = 0; j < 3; j++) {
+        int SrcIdx = OperandIdx[j];
+        if (SrcIdx < 0)
+          break;
+        if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
+          if (Reg->getReg() == AMDGPU::ALU_CONST) {
+            ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
+            Consts.push_back(Cst->getZExtValue());
+          }
+        }
       }
+
+      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
+      Consts.push_back(Cst->getZExtValue());
+      if (!TII->fitsConstReadLimitations(Consts))
+        break;
+
+      Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
+      Ops[SelIdx[i] - 1] = CstOffset;
+      return true;
       }
-      break;
     case ISD::FNEG:
       if (NegIdx[i] < 0)
         break;
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
index f65e1f3..922cac1 100644
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ b/lib/Target/R600/AMDILISelLowering.cpp
@@ -33,11 +33,6 @@
 
 using namespace llvm;
 //===----------------------------------------------------------------------===//
-// Calling Convention Implementation
-//===----------------------------------------------------------------------===//
-#include "AMDGPUGenCallingConv.inc"
-
-//===----------------------------------------------------------------------===//
 // TargetLowering Implementation Help Functions End
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/R600/AMDILSIDevice.cpp b/lib/Target/R600/AMDILSIDevice.cpp
index 3096c22..0d1de3d 100644
--- a/lib/Target/R600/AMDILSIDevice.cpp
+++ b/lib/Target/R600/AMDILSIDevice.cpp
@@ -36,10 +36,13 @@ AMDGPUSIDevice::getGeneration() const {
 
 std::string
 AMDGPUSIDevice::getDataLayout() const {
-  return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16"
-      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
-      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
-      "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
-      "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
-      "-n8:16:32:64");
+  return std::string(
+    "e"
+    "-p:64:64:64"
+    "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64"
+    "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128"
+    "-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+    "-v2048:2048:2048"
+    "-n32:64"
+  );
 }
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 00f8b10..63c59e1 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -37,11 +37,10 @@ add_llvm_target(R600CodeGen
   R600ExpandSpecialInstrs.cpp
   R600InstrInfo.cpp
   R600ISelLowering.cpp
-  R600LowerConstCopy.cpp
   R600MachineFunctionInfo.cpp
+  R600MachineScheduler.cpp
   R600RegisterInfo.cpp
   SIAnnotateControlFlow.cpp
-  SIAssignInterpRegs.cpp
   SIInsertWaits.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 8721f80..cd3a7ce 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -33,15 +33,6 @@ public:
                                      SmallVectorImpl<MCFixup> &Fixups) const {
     return 0;
   }
-
-  virtual unsigned GPR4AlignEncode(const MCInst  &MI, unsigned OpNo,
-                                   SmallVectorImpl<MCFixup> &Fixups) const {
-    return 0;
-  }
-  virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo,
-                                   SmallVectorImpl<MCFixup> &Fixups) const {
-    return 0;
-  }
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
index 6cc0077..e27abcc 100644
--- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -42,9 +42,6 @@ class SIMCCodeEmitter : public  AMDGPUMCCodeEmitter {
   const MCSubtargetInfo &STI;
   MCContext &Ctx;
 
-  /// \brief Encode a sequence of registers with the correct alignment.
-  unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const;
-
   /// \brief Can this operand also contain immediate values?
   bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
 
@@ -65,14 +62,6 @@ public:
   /// \returns the encoding for an MCOperand.
   virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                                      SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// \brief Encoding for when 2 consecutive registers are used
-  virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo,
-                                   SmallVectorImpl<MCFixup> &Fixup) const;
-
-  /// \brief Encoding for when 4 consectuive registers are used
-  virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo,
-                                   SmallVectorImpl<MCFixup> &Fixup) const;
 };
 
 } // End anonymous namespace
@@ -212,24 +201,3 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
   return 0;
 }
 
-//===----------------------------------------------------------------------===//
-// Custom Operand Encodings
-//===----------------------------------------------------------------------===//
-
-unsigned SIMCCodeEmitter::GPRAlign(const MCInst &MI, unsigned OpNo,
-                                   unsigned shift) const {
-  unsigned regCode = MRI.getEncodingValue(MI.getOperand(OpNo).getReg());
-  return (regCode & 0xff) >> shift;
-}
-
-unsigned SIMCCodeEmitter::GPR2AlignEncode(const MCInst &MI,
-                                          unsigned OpNo ,
-                                        SmallVectorImpl<MCFixup> &Fixup) const {
-  return GPRAlign(MI, OpNo, 1);
-}
-
-unsigned SIMCCodeEmitter::GPR4AlignEncode(const MCInst &MI,
-                                          unsigned OpNo,
-                                        SmallVectorImpl<MCFixup> &Fixup) const {
-  return GPRAlign(MI, OpNo, 2);
-}
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index b5c2a93..a73691d 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -50,8 +50,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::UREM, MVT::v4i32, Expand);
   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
 
-  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
-  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
 
   setOperationAction(ISD::FSUB, MVT::f32, Expand);
 
@@ -65,8 +65,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 
-  setOperationAction(ISD::SETCC, MVT::i32, Custom);
-  setOperationAction(ISD::SETCC, MVT::f32, Custom);
+  setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
 
   setOperationAction(ISD::SELECT, MVT::i32, Custom);
@@ -94,6 +94,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::SELECT_CC);
 
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
   setSchedulingPreference(Sched::VLIW);
 }
 
@@ -105,7 +106,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 
   switch (MI->getOpcode()) {
   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
-  case AMDGPU::SHADER_TYPE: break;
   case AMDGPU::CLAMP_R600: {
     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
@@ -150,7 +150,13 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                      MI->getOperand(1).getImm());
     break;
-
+  case AMDGPU::CONST_COPY: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
+        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
+    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
+        MI->getOperand(1).getImm());
+    break;
+  }
 
   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
@@ -215,8 +221,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 
   case AMDGPU::BRANCH:
       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
-              .addOperand(MI->getOperand(0))
-              .addReg(0);
+              .addOperand(MI->getOperand(0));
       break;
 
   case AMDGPU::BRANCH_COND_f32: {
@@ -227,7 +232,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
               .addImm(OPCODE_IS_NOT_ZERO)
               .addImm(0); // Flags
     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
-    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
             .addOperand(MI->getOperand(0))
             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
     break;
@@ -241,7 +246,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
             .addImm(OPCODE_IS_NOT_ZERO_INT)
             .addImm(0); // Flags
     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
-    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
     break;
@@ -306,11 +311,9 @@ using namespace llvm::AMDGPUIntrinsic;
 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
   case ISD::ROTL: return LowerROTL(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
-  case ISD::SETCC: return LowerSETCC(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::LOAD: return LowerLOAD(Op, DAG);
   case ISD::FPOW: return LowerFPOW(Op, DAG);
@@ -470,44 +473,6 @@ SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
       );
 }
 
-SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Chain = Op.getOperand(0);
-  SDValue CC = Op.getOperand(1);
-  SDValue LHS   = Op.getOperand(2);
-  SDValue RHS   = Op.getOperand(3);
-  SDValue JumpT  = Op.getOperand(4);
-  SDValue CmpValue;
-  SDValue Result;
-
-  if (LHS.getValueType() == MVT::i32) {
-    CmpValue = DAG.getNode(
-        ISD::SELECT_CC,
-        Op.getDebugLoc(),
-        MVT::i32,
-        LHS, RHS,
-        DAG.getConstant(-1, MVT::i32),
-        DAG.getConstant(0, MVT::i32),
-        CC);
-  } else if (LHS.getValueType() == MVT::f32) {
-    CmpValue = DAG.getNode(
-        ISD::SELECT_CC,
-        Op.getDebugLoc(),
-        MVT::f32,
-        LHS, RHS,
-        DAG.getConstantFP(1.0f, MVT::f32),
-        DAG.getConstantFP(0.0f, MVT::f32),
-        CC);
-  } else {
-    assert(0 && "Not valid type for br_cc");
-  }
-  Result = DAG.getNode(
-      AMDGPUISD::BRANCH_COND,
-      CmpValue.getDebugLoc(),
-      MVT::Other, Chain,
-      JumpT, CmpValue);
-  return Result;
-}
-
 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                    DebugLoc DL,
                                                    unsigned DwordOffset) const {
@@ -576,12 +541,37 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
 
   // Check if we can lower this to a native operation.
 
+  // Try to lower to a SET* instruction:
+  //
+  // SET* can match the following patterns:
+  //
+  // select_cc f32, f32, -1,  0, cc_any
+  // select_cc f32, f32, 1.0f, 0.0f, cc_any
+  // select_cc i32, i32, -1,  0, cc_any
+  //
+
+  // Move hardware True/False values to the correct operand.
+  if (isHWTrueValue(False) && isHWFalseValue(True)) {
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    std::swap(False, True);
+    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
+  }
+
+  if (isHWTrueValue(True) && isHWFalseValue(False) &&
+      (CompareVT == VT || VT == MVT::i32)) {
+    // This can be matched by a SET* instruction.
+    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
+  }
+
   // Try to lower to a CND* instruction:
-  // CND* instructions requires RHS to be zero.  Some SELECT_CC nodes that
-  // can be lowered to CND* instructions can also be lowered to SET*
-  // instructions.  CND* instructions are cheaper, because they dont't
-  // require additional instructions to convert their result to the correct
-  // value type, so this check should be first.
+  //
+  // CND* can match the following patterns:
+  //
+  // select_cc f32, 0.0, f32, f32, cc_any
+  // select_cc f32, 0.0, i32, i32, cc_any
+  // select_cc i32, 0,   f32, f32, cc_any
+  // select_cc i32, 0,   i32, i32, cc_any
+  //
   if (isZero(LHS) || isZero(RHS)) {
     SDValue Cond = (isZero(LHS) ? RHS : LHS);
     SDValue Zero = (isZero(LHS) ? LHS : RHS);
@@ -623,38 +613,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
   }
 
-  // Try to lower to a SET* instruction:
-  //
-  // CompareVT == MVT::f32 and VT == MVT::i32 is supported by the hardware,
-  // but for the other case where CompareVT != VT, all operands of
-  // SELECT_CC need to have the same value type, so we need to change True and
-  // False to be the same type as LHS and RHS, and then convert the result of
-  // the select_cc back to the correct type.
-
-  // Move hardware True/False values to the correct operand.
-  if (isHWTrueValue(False) && isHWFalseValue(True)) {
-    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
-    std::swap(False, True);
-    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
-  }
-
-  if (isHWTrueValue(True) && isHWFalseValue(False)) {
-    if (CompareVT !=  VT && VT == MVT::f32 && CompareVT == MVT::i32) {
-      SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
-          LHS, RHS,
-          DAG.getConstant(-1, MVT::i32),
-          DAG.getConstant(0, MVT::i32),
-          CC);
-      // Convert integer values of true (-1) and false (0) to fp values of
-      // true (1.0f) and false (0.0f).
-      SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
-                                                DAG.getConstant(1, MVT::i32));
-      return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
-    } else {
-      // This SELECT_CC is already legal.
-      return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
-    }
-  }
 
   // Possible Min/Max pattern
   SDValue MinMax = LowerMinMax(Op, DAG);
@@ -698,48 +656,6 @@ SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       DAG.getCondCode(ISD::SETNE));
 }
 
-SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Cond;
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDValue CC  = Op.getOperand(2);
-  DebugLoc DL = Op.getDebugLoc();
-  assert(Op.getValueType() == MVT::i32);
-  if (LHS.getValueType() == MVT::i32) {
-    Cond = DAG.getNode(
-        ISD::SELECT_CC,
-        Op.getDebugLoc(),
-        MVT::i32,
-        LHS, RHS,
-        DAG.getConstant(-1, MVT::i32),
-        DAG.getConstant(0, MVT::i32),
-        CC);
-  } else if (LHS.getValueType() == MVT::f32) {
-    Cond = DAG.getNode(
-        ISD::SELECT_CC,
-        Op.getDebugLoc(),
-        MVT::f32,
-        LHS, RHS,
-        DAG.getConstantFP(1.0f, MVT::f32),
-        DAG.getConstantFP(0.0f, MVT::f32),
-        CC);
-    Cond = DAG.getNode(
-        ISD::FP_TO_SINT,
-        DL,
-        MVT::i32,
-        Cond);
-  } else {
-    assert(0 && "Not valid type for set_cc");
-  }
-  Cond = DAG.getNode(
-      ISD::AND,
-      DL,
-      MVT::i32,
-      DAG.getConstant(1, MVT::i32),
-      Cond);
-  return Cond;
-}
-
 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
 /// convert these pointers to a register index.  Each register holds
 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
@@ -918,7 +834,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
   if (ConstantBlock > -1) {
     SDValue Result;
     if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
-        dyn_cast<Constant>(LoadNode->getSrcValue())) {
+        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
+        dyn_cast<ConstantSDNode>(Ptr)) {
       SDValue Slots[4];
       for (unsigned i = 0; i < 4; i++) {
         // We want Const position encoded with the following formula :
@@ -934,7 +851,9 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
     } else {
       // non constant ptr cant be folded, keeps it as a v4f32 load
       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
-          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32))
+          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
+          DAG.getConstant(LoadNode->getAddressSpace() -
+	                  AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
           );
     }
 
@@ -1122,6 +1041,9 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SELECT_CC: {
     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
     //      selectcc x, y, a, b, inv(cc)
+    //
+    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
+    //      selectcc x, y, a, b, cc
     SDValue LHS = N->getOperand(0);
     if (LHS.getOpcode() != ISD::SELECT_CC) {
       return SDValue();
@@ -1130,24 +1052,30 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
     SDValue RHS = N->getOperand(1);
     SDValue True = N->getOperand(2);
     SDValue False = N->getOperand(3);
+    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
 
     if (LHS.getOperand(2).getNode() != True.getNode() ||
         LHS.getOperand(3).getNode() != False.getNode() ||
-        RHS.getNode() != False.getNode() ||
-        cast<CondCodeSDNode>(N->getOperand(4))->get() != ISD::SETEQ) {
+        RHS.getNode() != False.getNode()) {
       return SDValue();
     }
 
-    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(LHS->getOperand(4))->get();
-    CCOpcode = ISD::getSetCCInverse(
-                        CCOpcode, LHS.getOperand(0).getValueType().isInteger());
-    return DAG.getSelectCC(N->getDebugLoc(),
-                           LHS.getOperand(0),
-                           LHS.getOperand(1),
-                           LHS.getOperand(2),
-                           LHS.getOperand(3),
-                           CCOpcode);
+    switch (NCC) {
+    default: return SDValue();
+    case ISD::SETNE: return LHS;
+    case ISD::SETEQ: {
+      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
+      LHSCC = ISD::getSetCCInverse(LHSCC,
+                                  LHS.getOperand(0).getValueType().isInteger());
+      return DAG.getSelectCC(N->getDebugLoc(),
+                             LHS.getOperand(0),
+                             LHS.getOperand(1),
+                             LHS.getOperand(2),
+                             LHS.getOperand(3),
+                             LHSCC);
     }
+    }
+  }
   case AMDGPUISD::EXPORT: {
     SDValue Arg = N->getOperand(1);
     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index afa3897..5cb4b91 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -52,14 +52,11 @@ private:
   void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
       MachineRegisterInfo & MRI, unsigned dword_offset) const;
 
-  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
-
   /// \brief Lower ROTL opcode to BITALIGN
   SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 7e3f005..0865098 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -139,6 +139,60 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
           (TargetFlags & R600_InstFlag::OP3));
 }
 
+bool
+R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
+    const {
+  assert (Consts.size() <= 12 && "Too many operands in instructions group");
+  unsigned Pair1 = 0, Pair2 = 0;
+  for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
+    unsigned ReadConstHalf = Consts[i] & 2;
+    unsigned ReadConstIndex = Consts[i] & (~3);
+    unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf;
+    if (!Pair1) {
+      Pair1 = ReadHalfConst;
+      continue;
+    }
+    if (Pair1 == ReadHalfConst)
+      continue;
+    if (!Pair2) {
+      Pair2 = ReadHalfConst;
+      continue;
+    }
+    if (Pair2 != ReadHalfConst)
+      return false;
+  }
+  return true;
+}
+
+bool
+R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const {
+  std::vector<unsigned> Consts;
+  for (unsigned i = 0, n = MIs.size(); i < n; i++) {
+    const MachineInstr *MI = MIs[i];
+
+    const R600Operands::Ops OpTable[3][2] = {
+      {R600Operands::SRC0, R600Operands::SRC0_SEL},
+      {R600Operands::SRC1, R600Operands::SRC1_SEL},
+      {R600Operands::SRC2, R600Operands::SRC2_SEL},
+    };
+
+    if (!isALUInstr(MI->getOpcode()))
+      continue;
+
+    for (unsigned j = 0; j < 3; j++) {
+      int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
+      if (SrcIdx < 0)
+        break;
+      if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) {
+        unsigned Const = MI->getOperand(
+            getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
+        Consts.push_back(Const);
+      }
+    }
+  }
+  return fitsConstReadLimitations(Consts);
+}
+
 DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
     const ScheduleDAG *DAG) const {
   const InstrItineraryData *II = TM->getInstrItineraryData();
@@ -168,6 +222,11 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
   return NULL;
 }
 
+static
+bool isJump(unsigned Opcode) {
+  return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
+}
+
 bool
 R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                              MachineBasicBlock *&TBB,
@@ -186,7 +245,7 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       return false;
     --I;
   }
-  if (static_cast<MachineInstr *>(I)->getOpcode() != AMDGPU::JUMP) {
+  if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) {
     return false;
   }
 
@@ -196,22 +255,20 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
   // If there is only one terminator instruction, process it.
   unsigned LastOpc = LastInst->getOpcode();
   if (I == MBB.begin() ||
-      static_cast<MachineInstr *>(--I)->getOpcode() != AMDGPU::JUMP) {
+          !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) {
     if (LastOpc == AMDGPU::JUMP) {
-      if(!isPredicated(LastInst)) {
-        TBB = LastInst->getOperand(0).getMBB();
-        return false;
-      } else {
-        MachineInstr *predSet = I;
-        while (!isPredicateSetter(predSet->getOpcode())) {
-          predSet = --I;
-        }
-        TBB = LastInst->getOperand(0).getMBB();
-        Cond.push_back(predSet->getOperand(1));
-        Cond.push_back(predSet->getOperand(2));
-        Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
-        return false;
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    } else if (LastOpc == AMDGPU::JUMP_COND) {
+      MachineInstr *predSet = I;
+      while (!isPredicateSetter(predSet->getOpcode())) {
+        predSet = --I;
       }
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(predSet->getOperand(1));
+      Cond.push_back(predSet->getOperand(2));
+      Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+      return false;
     }
     return true;  // Can't handle indirect branch.
   }
@@ -221,10 +278,7 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
   unsigned SecondLastOpc = SecondLastInst->getOpcode();
 
   // If the block ends with a B and a Bcc, handle it.
-  if (SecondLastOpc == AMDGPU::JUMP &&
-      isPredicated(SecondLastInst) &&
-      LastOpc == AMDGPU::JUMP &&
-      !isPredicated(LastInst)) {
+  if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
     MachineInstr *predSet = --I;
     while (!isPredicateSetter(predSet->getOpcode())) {
       predSet = --I;
@@ -261,7 +315,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
 
   if (FBB == 0) {
     if (Cond.empty()) {
-      BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB).addReg(0);
+      BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
       return 1;
     } else {
       MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
@@ -269,7 +323,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
       addFlag(PredSet, 0, MO_FLAG_PUSH);
       PredSet->getOperand(2).setImm(Cond[1].getImm());
 
-      BuildMI(&MBB, DL, get(AMDGPU::JUMP))
+      BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
              .addMBB(TBB)
              .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
       return 1;
@@ -279,10 +333,10 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
     assert(PredSet && "No previous predicate !");
     addFlag(PredSet, 0, MO_FLAG_PUSH);
     PredSet->getOperand(2).setImm(Cond[1].getImm());
-    BuildMI(&MBB, DL, get(AMDGPU::JUMP))
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
             .addMBB(TBB)
             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
-    BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB).addReg(0);
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
     return 2;
   }
 }
@@ -302,11 +356,13 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
   switch (I->getOpcode()) {
   default:
     return 0;
+  case AMDGPU::JUMP_COND: {
+    MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
+    clearFlag(predSet, 0, MO_FLAG_PUSH);
+    I->eraseFromParent();
+    break;
+  }
   case AMDGPU::JUMP:
-    if (isPredicated(I)) {
-      MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
-      clearFlag(predSet, 0, MO_FLAG_PUSH);
-    }
     I->eraseFromParent();
     break;
   }
@@ -320,11 +376,13 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
     // FIXME: only one case??
   default:
     return 1;
+  case AMDGPU::JUMP_COND: {
+    MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
+    clearFlag(predSet, 0, MO_FLAG_PUSH);
+    I->eraseFromParent();
+    break;
+  }
   case AMDGPU::JUMP:
-    if (isPredicated(I)) {
-      MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
-      clearFlag(predSet, 0, MO_FLAG_PUSH);
-    }
     I->eraseFromParent();
     break;
   }
@@ -356,6 +414,8 @@ R600InstrInfo::isPredicable(MachineInstr *MI) const {
 
   if (MI->getOpcode() == AMDGPU::KILLGT) {
     return false;
+  } else if (isVector(*MI)) {
+    return false;
   } else {
     return AMDGPUInstrInfo::isPredicable(MI);
   }
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index efe721c..bf9569e 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -53,6 +53,9 @@ namespace llvm {
   /// \returns true if this \p Opcode represents an ALU instruction.
   bool isALUInstr(unsigned Opcode) const;
 
+  bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
+  bool canBundle(const std::vector<MachineInstr *> &) const;
+
   /// \breif Vector instructions are instructions that must fill all
   /// instruction slots within an instruction group.
   bool isVector(const MachineInstr &MI) const;
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 8242df9..8c50d54 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -512,8 +512,8 @@ def INTERP_PAIR_ZW :  AMDGPUShaderInst <
   []>;
 
 def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
-  SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
-  [SDNPMayLoad]
+  SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
+  [SDNPVariadic]
 >;
 
 //===----------------------------------------------------------------------===//
@@ -1090,12 +1090,12 @@ class COS_Common <bits<11> inst> : R600_1OP <
 multiclass DIV_Common <InstR600 recip_ieee> {
 def : Pat<
   (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1),
-  (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+  (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
 >;
 
 def : Pat<
   (fdiv R600_Reg32:$src0, R600_Reg32:$src1),
-  (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+  (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
 >;
 }
 
@@ -1169,12 +1169,12 @@ let Predicates = [isR600] in {
 // cards.
 class COS_PAT <InstR600 trig> : Pat<
   (fcos R600_Reg32:$src),
-  (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
+  (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
 >;
 
 class SIN_PAT <InstR600 trig> : Pat<
   (fsin R600_Reg32:$src),
-  (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
+  (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
 >;
 
 //===----------------------------------------------------------------------===//
@@ -1587,19 +1587,28 @@ def PRED_X : InstR600 <
   (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
   "", [], NullALU> {
   let FlagOperandIdx = 3;
-  let isTerminator = 1;
 }
 
-let isTerminator = 1, isBranch = 1, isBarrier = 1 in {
-
-def JUMP : InstR600 <0x10,
+let isTerminator = 1, isBranch = 1 in {
+def JUMP_COND : InstR600 <0x10,
           (outs),
-          (ins brtarget:$target, R600_Pred:$p),
+          (ins brtarget:$target, R600_Predicate_Bit:$p),
           "JUMP $target ($p)",
           [], AnyALU
   >;
 
-}  // End isTerminator = 1, isBranch = 1, isBarrier = 1
+def JUMP : InstR600 <0x10,
+          (outs),
+          (ins brtarget:$target),
+          "JUMP $target",
+          [], AnyALU
+  >
+{
+  let isPredicable = 1;
+  let isBarrier = 1;
+}
+
+}  // End isTerminator = 1, isBranch = 1
 
 let usesCustomInserter = 1 in {
 
@@ -1639,7 +1648,7 @@ def FNEG_R600 : FNEG<R600_Reg32>;
 //===---------------------------------------------------------------------===//
 // Return instruction
 //===---------------------------------------------------------------------===//
-let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1,
+let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
     usesCustomInserter = 1 in {
   def RETURN          : ILFormat<(outs), (ins variable_ops),
       "RETURN", [(IL_retflag)]>;
@@ -1650,27 +1659,27 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1,
 // Constant Buffer Addressing Support
 //===----------------------------------------------------------------------===//
 
-let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
+let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
 def CONST_COPY : Instruction {
   let OutOperandList = (outs R600_Reg32:$dst);
   let InOperandList = (ins i32imm:$src);
-  let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
+  let Pattern =
+      [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
   let AsmString = "CONST_COPY";
   let neverHasSideEffects = 1;
   let isAsCheapAsAMove = 1;
   let Itinerary = NullALU;
 }
-} // end isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
+} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
 
 def TEX_VTX_CONSTBUF :
-  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr), "VTX_READ_eg $dst, $ptr",
-      [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr))]>,
+  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr",
+      [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>,
   VTX_WORD1_GPR, VTX_WORD0 {
 
   let VC_INST = 0;
   let FETCH_TYPE = 2;
   let FETCH_WHOLE_QUAD = 0;
-  let BUFFER_ID = 0;
   let SRC_REL = 0;
   let SRC_SEL_X = 0;
   let DST_REL = 0;
@@ -1840,6 +1849,18 @@ let isTerminator=1 in {
 // ISel Patterns
 //===----------------------------------------------------------------------===//
 
+// CND*_INT Pattterns for f32 True / False values
+
+class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat <
+  (selectcc (i32 R600_Reg32:$src0), 0, (f32 R600_Reg32:$src1),
+                                            R600_Reg32:$src2, cc),
+  (cnd R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2)
+>;
+
+def : CND_INT_f32 <CNDE_INT,  SETEQ>;
+def : CND_INT_f32 <CNDGT_INT, SETGT>;
+def : CND_INT_f32 <CNDGE_INT, SETGE>;
+
 //CNDGE_INT extra pattern
 def : Pat <
   (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1),
@@ -1958,8 +1979,8 @@ def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sub1>;
 def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sub2>;
 def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>;
 
-def : Vector_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
-def : Vector_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
+def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
+def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
 
 // bitconvert patterns
 
diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp
deleted file mode 100644
index 3ebe653..0000000
--- a/lib/Target/R600/R600LowerConstCopy.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-//===-- R600LowerConstCopy.cpp - Propagate ConstCopy / lower them to MOV---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass is intended to handle remaining ConstCopy pseudo MachineInstr.
-/// ISel will fold each Const Buffer read inside scalar ALU. However it cannot
-/// fold them inside vector instruction, like DOT4 or Cube ; ISel emits
-/// ConstCopy instead. This pass (executed after ExpandingSpecialInstr) will try
-/// to fold them if possible or replace them by MOV otherwise.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "R600InstrInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/GlobalValue.h"
-
-namespace llvm {
-
-class R600LowerConstCopy : public MachineFunctionPass {
-private:
-  static char ID;
-  const R600InstrInfo *TII;
-
-  struct ConstPairs {
-    unsigned XYPair;
-    unsigned ZWPair;
-  };
-
-  bool canFoldInBundle(ConstPairs &UsedConst, unsigned ReadConst) const;
-public:
-  R600LowerConstCopy(TargetMachine &tm);
-  virtual bool runOnMachineFunction(MachineFunction &MF);
-
-  const char *getPassName() const { return "R600 Eliminate Symbolic Operand"; }
-};
-
-char R600LowerConstCopy::ID = 0;
-
-R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) :
-    MachineFunctionPass(ID),
-    TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo()))
-{
-}
-
-bool R600LowerConstCopy::canFoldInBundle(ConstPairs &UsedConst,
-    unsigned ReadConst) const {
-  unsigned ReadConstChan = ReadConst & 3;
-  unsigned ReadConstIndex = ReadConst & (~3);
-  if (ReadConstChan < 2) {
-    if (!UsedConst.XYPair) {
-      UsedConst.XYPair = ReadConstIndex;
-    }
-    return UsedConst.XYPair == ReadConstIndex;
-  } else {
-    if (!UsedConst.ZWPair) {
-      UsedConst.ZWPair = ReadConstIndex;
-    }
-    return UsedConst.ZWPair == ReadConstIndex;
-  }
-}
-
-static bool isControlFlow(const MachineInstr &MI) {
-  return (MI.getOpcode() == AMDGPU::IF_PREDICATE_SET) ||
-  (MI.getOpcode() == AMDGPU::ENDIF) ||
-  (MI.getOpcode() == AMDGPU::ELSE) ||
-  (MI.getOpcode() == AMDGPU::WHILELOOP) ||
-  (MI.getOpcode() == AMDGPU::BREAK);
-}
-
-bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) {
-
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                  BB != BB_E; ++BB) {
-    MachineBasicBlock &MBB = *BB;
-    DenseMap<unsigned, MachineInstr *> RegToConstIndex;
-    for (MachineBasicBlock::instr_iterator I = MBB.instr_begin(),
-        E = MBB.instr_end(); I != E;) {
-
-      if (I->getOpcode() == AMDGPU::CONST_COPY) {
-        MachineInstr &MI = *I;
-        I = llvm::next(I);
-        unsigned DstReg = MI.getOperand(0).getReg();
-        DenseMap<unsigned, MachineInstr *>::iterator SrcMI =
-            RegToConstIndex.find(DstReg);
-        if (SrcMI != RegToConstIndex.end()) {
-          SrcMI->second->eraseFromParent();
-          RegToConstIndex.erase(SrcMI);
-        }
-        MachineInstr *NewMI = 
-            TII->buildDefaultInstruction(MBB, &MI, AMDGPU::MOV,
-            MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
-        TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
-            MI.getOperand(1).getImm());
-        RegToConstIndex[DstReg] = NewMI;
-        MI.eraseFromParent();
-        continue;
-      }
-
-      std::vector<unsigned> Defs;
-      // We consider all Instructions as bundled because algorithm that  handle
-      // const read port limitations inside an IG is still valid with single
-      // instructions.
-      std::vector<MachineInstr *> Bundle;
-
-      if (I->isBundle()) {
-        unsigned BundleSize = I->getBundleSize();
-        for (unsigned i = 0; i < BundleSize; i++) {
-          I = llvm::next(I);
-          Bundle.push_back(I);
-        }
-      } else if (TII->isALUInstr(I->getOpcode())){
-        Bundle.push_back(I);
-      } else if (isControlFlow(*I)) {
-          RegToConstIndex.clear();
-          I = llvm::next(I);
-          continue;
-      } else {
-        MachineInstr &MI = *I;
-        for (MachineInstr::mop_iterator MOp = MI.operands_begin(),
-            MOpE = MI.operands_end(); MOp != MOpE; ++MOp) {
-          MachineOperand &MO = *MOp;
-          if (!MO.isReg())
-            continue;
-          if (MO.isDef()) {
-            Defs.push_back(MO.getReg());
-          } else {
-            // Either a TEX or an Export inst, prevent from erasing def of used
-            // operand
-            RegToConstIndex.erase(MO.getReg());
-            for (MCSubRegIterator SR(MO.getReg(), &TII->getRegisterInfo());
-                SR.isValid(); ++SR) {
-              RegToConstIndex.erase(*SR);
-            }
-          }
-        }
-      }
-
-
-      R600Operands::Ops OpTable[3][2] = {
-        {R600Operands::SRC0, R600Operands::SRC0_SEL},
-        {R600Operands::SRC1, R600Operands::SRC1_SEL},
-        {R600Operands::SRC2, R600Operands::SRC2_SEL},
-      };
-
-      for(std::vector<MachineInstr *>::iterator It = Bundle.begin(),
-          ItE = Bundle.end(); It != ItE; ++It) {
-        MachineInstr *MI = *It;
-        if (TII->isPredicated(MI)) {
-          // We don't want to erase previous assignment
-          RegToConstIndex.erase(MI->getOperand(0).getReg());
-        } else {
-          int WriteIDX = TII->getOperandIdx(MI->getOpcode(), R600Operands::WRITE);
-          if (WriteIDX < 0 || MI->getOperand(WriteIDX).getImm())
-            Defs.push_back(MI->getOperand(0).getReg());
-        }
-      }
-
-      ConstPairs CP = {0,0};
-      for (unsigned SrcOp = 0; SrcOp < 3; SrcOp++) {
-        for(std::vector<MachineInstr *>::iterator It = Bundle.begin(),
-            ItE = Bundle.end(); It != ItE; ++It) {
-          MachineInstr *MI = *It;
-          int SrcIdx = TII->getOperandIdx(MI->getOpcode(), OpTable[SrcOp][0]);
-          if (SrcIdx < 0)
-            continue;
-          MachineOperand &MO = MI->getOperand(SrcIdx);
-          DenseMap<unsigned, MachineInstr *>::iterator SrcMI =
-              RegToConstIndex.find(MO.getReg());
-          if (SrcMI != RegToConstIndex.end()) {
-            MachineInstr *CstMov = SrcMI->second;
-            int ConstMovSel =
-                TII->getOperandIdx(CstMov->getOpcode(), R600Operands::SRC0_SEL);
-            unsigned ConstIndex = CstMov->getOperand(ConstMovSel).getImm();
-            if (MI->isInsideBundle() && canFoldInBundle(CP, ConstIndex)) {
-              TII->setImmOperand(MI, OpTable[SrcOp][1], ConstIndex);
-              MI->getOperand(SrcIdx).setReg(AMDGPU::ALU_CONST);
-            } else {
-              RegToConstIndex.erase(SrcMI);
-            }
-          }
-        }
-      }
-
-      for (std::vector<unsigned>::iterator It = Defs.begin(), ItE = Defs.end();
-          It != ItE; ++It) {
-        DenseMap<unsigned, MachineInstr *>::iterator SrcMI =
-            RegToConstIndex.find(*It);
-        if (SrcMI != RegToConstIndex.end()) {
-          SrcMI->second->eraseFromParent();
-          RegToConstIndex.erase(SrcMI);
-        }
-      }
-      I = llvm::next(I);
-    }
-
-    if (MBB.succ_empty()) {
-      for (DenseMap<unsigned, MachineInstr *>::iterator
-          DI = RegToConstIndex.begin(), DE = RegToConstIndex.end();
-          DI != DE; ++DI) {
-        DI->second->eraseFromParent();
-      }
-    }
-  }
-  return false;
-}
-
-FunctionPass *createR600LowerConstCopy(TargetMachine &tm) {
-  return new R600LowerConstCopy(tm);
-}
-
-}
-
-
diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp
index 40aec83..b07a585 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.cpp
+++ b/lib/Target/R600/R600MachineFunctionInfo.cpp
@@ -14,5 +14,4 @@ using namespace llvm;
 
 R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
   : MachineFunctionInfo() {
-    memset(Outputs, 0, sizeof(Outputs));
   }
diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
index 4b901f4..13a46b8 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.h
+++ b/lib/Target/R600/R600MachineFunctionInfo.h
@@ -26,7 +26,6 @@ public:
   R600MachineFunctionInfo(const MachineFunction &MF);
   SmallVector<unsigned, 4> LiveOuts;
   std::vector<unsigned> IndirectRegs;
-  SDNode *Outputs[16];
 };
 
 } // End llvm namespace
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
new file mode 100644
index 0000000..9074364
--- /dev/null
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -0,0 +1,427 @@
+//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "misched"
+
+#include "R600MachineScheduler.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/raw_ostream.h"
+#include <set>
+
+using namespace llvm;
+
+void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
+
+  DAG = dag;
+  TII = static_cast<const R600InstrInfo*>(DAG->TII);
+  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
+  MRI = &DAG->MRI;
+  Available[IDAlu]->clear();
+  Available[IDFetch]->clear();
+  Available[IDOther]->clear();
+  CurInstKind = IDOther;
+  CurEmitted = 0;
+  OccupedSlotsMask = 15;
+  InstKindLimit[IDAlu] = 120; // 120 minus 8 for security
+
+
+  const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
+  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD5XXX) {
+    InstKindLimit[IDFetch] = 7; // 8 minus 1 for security
+  } else {
+    InstKindLimit[IDFetch] = 15; // 16 minus 1 for security
+  }
+}
+
+void R600SchedStrategy::MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst)
+{
+  if (QSrc->empty())
+    return;
+  for (ReadyQueue::iterator I = QSrc->begin(),
+      E = QSrc->end(); I != E; ++I) {
+    (*I)->NodeQueueId &= ~QSrc->getID();
+    QDst->push(*I);
+  }
+  QSrc->clear();
+}
+
+SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
+  SUnit *SU = 0;
+  IsTopNode = true;
+  NextInstKind = IDOther;
+
+  // check if we might want to switch current clause type
+  bool AllowSwitchToAlu = (CurInstKind == IDOther) ||
+      (CurEmitted > InstKindLimit[CurInstKind]) ||
+      (Available[CurInstKind]->empty());
+  bool AllowSwitchFromAlu = (CurEmitted > InstKindLimit[CurInstKind]) &&
+      (!Available[IDFetch]->empty() || !Available[IDOther]->empty());
+
+  if ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
+      (!AllowSwitchFromAlu && CurInstKind == IDAlu)) {
+    // try to pick ALU
+    SU = pickAlu();
+    if (SU) {
+      if (CurEmitted >  InstKindLimit[IDAlu])
+        CurEmitted = 0;
+      NextInstKind = IDAlu;
+    }
+  }
+
+  if (!SU) {
+    // try to pick FETCH
+    SU = pickOther(IDFetch);
+    if (SU)
+      NextInstKind = IDFetch;
+  }
+
+  // try to pick other
+  if (!SU) {
+    SU = pickOther(IDOther);
+    if (SU)
+      NextInstKind = IDOther;
+  }
+
+  DEBUG(
+      if (SU) {
+        dbgs() << "picked node: ";
+        SU->dump(DAG);
+      } else {
+        dbgs() << "NO NODE ";
+        for (int i = 0; i < IDLast; ++i) {
+          Available[i]->dump();
+          Pending[i]->dump();
+        }
+        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
+          const SUnit &S = DAG->SUnits[i];
+          if (!S.isScheduled)
+            S.dump(DAG);
+        }
+      }
+  );
+
+  return SU;
+}
+
+void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+
+  DEBUG(dbgs() << "scheduled: ");
+  DEBUG(SU->dump(DAG));
+
+  if (NextInstKind != CurInstKind) {
+    DEBUG(dbgs() << "Instruction Type Switch\n");
+    if (NextInstKind != IDAlu)
+      OccupedSlotsMask = 15;
+    CurEmitted = 0;
+    CurInstKind = NextInstKind;
+  }
+
+  if (CurInstKind == IDAlu) {
+    switch (getAluKind(SU)) {
+    case AluT_XYZW:
+      CurEmitted += 4;
+      break;
+    case AluDiscarded:
+      break;
+    default: {
+      ++CurEmitted;
+      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
+          E = SU->getInstr()->operands_end(); It != E; ++It) {
+        MachineOperand &MO = *It;
+        if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+          ++CurEmitted;
+      }
+    }
+    }
+  } else {
+    ++CurEmitted;
+  }
+
+
+  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
+
+  if (CurInstKind != IDFetch) {
+    MoveUnits(Pending[IDFetch], Available[IDFetch]);
+  }
+  MoveUnits(Pending[IDOther], Available[IDOther]);
+}
+
+void R600SchedStrategy::releaseTopNode(SUnit *SU) {
+  int IK = getInstKind(SU);
+
+  DEBUG(dbgs() << IK << " <= ");
+  DEBUG(SU->dump(DAG));
+
+  Pending[IK]->push(SU);
+}
+
+void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
+}
+
+bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
+                                          const TargetRegisterClass *RC) const {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+    return RC->contains(Reg);
+  } else {
+    return MRI->getRegClass(Reg) == RC;
+  }
+}
+
+R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
+  MachineInstr *MI = SU->getInstr();
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+      return AluT_XYZW;
+    case AMDGPU::COPY:
+      if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {
+        // %vregX = COPY Tn_X is likely to be discarded in favor of an
+        // assignement of Tn_X to %vregX, don't considers it in scheduling
+        return AluDiscarded;
+      }
+      else if (MI->getOperand(1).isUndef()) {
+        // MI will become a KILL, don't considers it in scheduling
+        return AluDiscarded;
+      }
+    default:
+      break;
+    }
+
+    // Does the instruction take a whole IG ?
+    if(TII->isVector(*MI) ||
+        TII->isCubeOp(MI->getOpcode()) ||
+        TII->isReductionOp(MI->getOpcode()))
+      return AluT_XYZW;
+
+    // Is the result already assigned to a channel ?
+    unsigned DestSubReg = MI->getOperand(0).getSubReg();
+    switch (DestSubReg) {
+    case AMDGPU::sub0:
+      return AluT_X;
+    case AMDGPU::sub1:
+      return AluT_Y;
+    case AMDGPU::sub2:
+      return AluT_Z;
+    case AMDGPU::sub3:
+      return AluT_W;
+    default:
+      break;
+    }
+
+    // Is the result already member of a X/Y/Z/W class ?
+    unsigned DestReg = MI->getOperand(0).getReg();
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
+        regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
+      return AluT_X;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
+      return AluT_Y;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
+      return AluT_Z;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
+      return AluT_W;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
+      return AluT_XYZW;
+
+    return AluAny;
+
+}
+
+int R600SchedStrategy::getInstKind(SUnit* SU) {
+  int Opcode = SU->getInstr()->getOpcode();
+
+  if (TII->isALUInstr(Opcode)) {
+    return IDAlu;
+  }
+
+  switch (Opcode) {
+  case AMDGPU::COPY:
+  case AMDGPU::CONST_COPY:
+  case AMDGPU::INTERP_PAIR_XY:
+  case AMDGPU::INTERP_PAIR_ZW:
+  case AMDGPU::INTERP_VEC_LOAD:
+  case AMDGPU::DOT4_eg_pseudo:
+  case AMDGPU::DOT4_r600_pseudo:
+    return IDAlu;
+  case AMDGPU::TEX_VTX_CONSTBUF:
+  case AMDGPU::TEX_VTX_TEXBUF:
+  case AMDGPU::TEX_LD:
+  case AMDGPU::TEX_GET_TEXTURE_RESINFO:
+  case AMDGPU::TEX_GET_GRADIENTS_H:
+  case AMDGPU::TEX_GET_GRADIENTS_V:
+  case AMDGPU::TEX_SET_GRADIENTS_H:
+  case AMDGPU::TEX_SET_GRADIENTS_V:
+  case AMDGPU::TEX_SAMPLE:
+  case AMDGPU::TEX_SAMPLE_C:
+  case AMDGPU::TEX_SAMPLE_L:
+  case AMDGPU::TEX_SAMPLE_C_L:
+  case AMDGPU::TEX_SAMPLE_LB:
+  case AMDGPU::TEX_SAMPLE_C_LB:
+  case AMDGPU::TEX_SAMPLE_G:
+  case AMDGPU::TEX_SAMPLE_C_G:
+  case AMDGPU::TXD:
+  case AMDGPU::TXD_SHADOW:
+    return IDFetch;
+  default:
+    DEBUG(
+        dbgs() << "other inst: ";
+        SU->dump(DAG);
+    );
+    return IDOther;
+  }
+}
+
+SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) {
+  if (Q.empty())
+    return NULL;
+  for (std::set<SUnit *, CompareSUnit>::iterator It = Q.begin(), E = Q.end();
+      It != E; ++It) {
+    SUnit *SU = *It;
+    InstructionsGroupCandidate.push_back(SU->getInstr());
+    if (TII->canBundle(InstructionsGroupCandidate)) {
+      InstructionsGroupCandidate.pop_back();
+      Q.erase(It);
+      return SU;
+    } else {
+      InstructionsGroupCandidate.pop_back();
+    }
+  }
+  return NULL;
+}
+
+void R600SchedStrategy::LoadAlu() {
+  ReadyQueue *QSrc = Pending[IDAlu];
+  for (ReadyQueue::iterator I = QSrc->begin(),
+        E = QSrc->end(); I != E; ++I) {
+      (*I)->NodeQueueId &= ~QSrc->getID();
+      AluKind AK = getAluKind(*I);
+      AvailableAlus[AK].insert(*I);
+    }
+    QSrc->clear();
+}
+
+void R600SchedStrategy::PrepareNextSlot() {
+  DEBUG(dbgs() << "New Slot\n");
+  assert (OccupedSlotsMask && "Slot wasn't filled");
+  OccupedSlotsMask = 0;
+  InstructionsGroupCandidate.clear();
+  LoadAlu();
+}
+
+void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
+  unsigned DestReg = MI->getOperand(0).getReg();
+  // PressureRegister crashes if an operand is def and used in the same inst
+  // and we try to constraint its regclass
+  for (MachineInstr::mop_iterator It = MI->operands_begin(),
+      E = MI->operands_end(); It != E; ++It) {
+    MachineOperand &MO = *It;
+    if (MO.isReg() && !MO.isDef() &&
+        MO.getReg() == MI->getOperand(0).getReg())
+      return;
+  }
+  // Constrains the regclass of DestReg to assign it to Slot
+  switch (Slot) {
+  case 0:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
+    break;
+  case 1:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
+    break;
+  case 2:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
+    break;
+  case 3:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
+    break;
+  }
+}
+
+SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
+  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
+  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
+  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
+  if (!UnslotedSU) {
+    return SlotedSU;
+  } else if (!SlotedSU) {
+    AssignSlot(UnslotedSU->getInstr(), Slot);
+    return UnslotedSU;
+  } else {
+    //Determine which one to pick (the lesser one)
+    if (CompareSUnit()(SlotedSU, UnslotedSU)) {
+      AvailableAlus[AluAny].insert(UnslotedSU);
+      return SlotedSU;
+    } else {
+      AvailableAlus[IndexToID[Slot]].insert(SlotedSU);
+      AssignSlot(UnslotedSU->getInstr(), Slot);
+      return UnslotedSU;
+    }
+  }
+}
+
+bool R600SchedStrategy::isAvailablesAluEmpty() const {
+  return Pending[IDAlu]->empty() && AvailableAlus[AluAny].empty() &&
+      AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
+      AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
+      AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty();
+}
+
+SUnit* R600SchedStrategy::pickAlu() {
+  while (!isAvailablesAluEmpty()) {
+    if (!OccupedSlotsMask) {
+      // Flush physical reg copies (RA will discard them)
+      if (!AvailableAlus[AluDiscarded].empty()) {
+        OccupedSlotsMask = 15;
+        return PopInst(AvailableAlus[AluDiscarded]);
+      }
+      // If there is a T_XYZW alu available, use it
+      if (!AvailableAlus[AluT_XYZW].empty()) {
+        OccupedSlotsMask = 15;
+        return PopInst(AvailableAlus[AluT_XYZW]);
+      }
+    }
+    for (unsigned Chan = 0; Chan < 4; ++Chan) {
+      bool isOccupied = OccupedSlotsMask & (1 << Chan);
+      if (!isOccupied) {
+        SUnit *SU = AttemptFillSlot(Chan);
+        if (SU) {
+          OccupedSlotsMask |= (1 << Chan);
+          InstructionsGroupCandidate.push_back(SU->getInstr());
+          return SU;
+        }
+      }
+    }
+    PrepareNextSlot();
+  }
+  return NULL;
+}
+
+SUnit* R600SchedStrategy::pickOther(int QID) {
+  SUnit *SU = 0;
+  ReadyQueue *AQ = Available[QID];
+
+  if (AQ->empty()) {
+    MoveUnits(Pending[QID], AQ);
+  }
+  if (!AQ->empty()) {
+    SU = *AQ->begin();
+    AQ->remove(AQ->begin());
+  }
+  return SU;
+}
+
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
new file mode 100644
index 0000000..3d0367f
--- /dev/null
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -0,0 +1,120 @@
+//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600MACHINESCHEDULER_H_
+#define R600MACHINESCHEDULER_H_
+
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/PriorityQueue.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+class CompareSUnit {
+public:
+  bool operator()(const SUnit *S1, const SUnit *S2) {
+    return S1->getDepth() > S2->getDepth();
+  }
+};
+
+class R600SchedStrategy : public MachineSchedStrategy {
+
+  const ScheduleDAGMI *DAG;
+  const R600InstrInfo *TII;
+  const R600RegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+
+  enum InstQueue {
+    QAlu = 1,
+    QFetch = 2,
+    QOther = 4
+  };
+
+  enum InstKind {
+    IDAlu,
+    IDFetch,
+    IDOther,
+    IDLast
+  };
+
+  enum AluKind {
+    AluAny,
+    AluT_X,
+    AluT_Y,
+    AluT_Z,
+    AluT_W,
+    AluT_XYZW,
+    AluDiscarded, // LLVM Instructions that are going to be eliminated
+    AluLast
+  };
+
+  ReadyQueue *Available[IDLast], *Pending[IDLast];
+  std::multiset<SUnit *, CompareSUnit> AvailableAlus[AluLast];
+
+  InstKind CurInstKind;
+  int CurEmitted;
+  InstKind NextInstKind;
+
+  int InstKindLimit[IDLast];
+
+  int OccupedSlotsMask;
+
+public:
+  R600SchedStrategy() :
+    DAG(0), TII(0), TRI(0), MRI(0) {
+    Available[IDAlu] = new ReadyQueue(QAlu, "AAlu");
+    Available[IDFetch] = new ReadyQueue(QFetch, "AFetch");
+    Available[IDOther] = new ReadyQueue(QOther, "AOther");
+    Pending[IDAlu] = new ReadyQueue(QAlu<<4, "PAlu");
+    Pending[IDFetch] = new ReadyQueue(QFetch<<4, "PFetch");
+    Pending[IDOther] = new ReadyQueue(QOther<<4, "POther");
+  }
+
+  virtual ~R600SchedStrategy() {
+    for (unsigned I = 0; I < IDLast; ++I) {
+      delete Available[I];
+      delete Pending[I];
+    }
+  }
+
+  virtual void initialize(ScheduleDAGMI *dag);
+  virtual SUnit *pickNode(bool &IsTopNode);
+  virtual void schedNode(SUnit *SU, bool IsTopNode);
+  virtual void releaseTopNode(SUnit *SU);
+  virtual void releaseBottomNode(SUnit *SU);
+
+private:
+  std::vector<MachineInstr *> InstructionsGroupCandidate;
+
+  int getInstKind(SUnit *SU);
+  bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
+  AluKind getAluKind(SUnit *SU) const;
+  void LoadAlu();
+  bool isAvailablesAluEmpty() const;
+  SUnit *AttemptFillSlot (unsigned Slot);
+  void PrepareNextSlot();
+  SUnit *PopInst(std::multiset<SUnit *, CompareSUnit> &Q);
+
+  void AssignSlot(MachineInstr *MI, unsigned Slot);
+  SUnit* pickAlu();
+  SUnit* pickOther(int QID);
+  void MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst);
+};
+
+} // namespace llvm
+
+#endif /* R600MACHINESCHEDULER_H_ */
diff --git a/lib/Target/R600/SIAssignInterpRegs.cpp b/lib/Target/R600/SIAssignInterpRegs.cpp
deleted file mode 100644
index 832e44d..0000000
--- a/lib/Target/R600/SIAssignInterpRegs.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//===-- SIAssignInterpRegs.cpp - Assign interpolation registers -----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief This pass maps the pseudo interpolation registers to the correct physical
-/// registers.
-//
-/// Prior to executing a fragment shader, the GPU loads interpolation
-/// parameters into physical registers.  The specific physical register that each
-/// interpolation parameter ends up in depends on the type of the interpolation
-/// parameter as well as how many interpolation parameters are used by the
-/// shader.
-//
-//===----------------------------------------------------------------------===//
-
-
-
-#include "AMDGPU.h"
-#include "AMDIL.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
-using namespace llvm;
-
-namespace {
-
-class SIAssignInterpRegsPass : public MachineFunctionPass {
-
-private:
-  static char ID;
-  TargetMachine &TM;
-
-  void addLiveIn(MachineFunction * MF,  MachineRegisterInfo & MRI,
-                 unsigned physReg, unsigned virtReg);
-
-public:
-  SIAssignInterpRegsPass(TargetMachine &tm) :
-    MachineFunctionPass(ID), TM(tm) { }
-
-  virtual bool runOnMachineFunction(MachineFunction &MF);
-
-  const char *getPassName() const { return "SI Assign intrpolation registers"; }
-};
-
-} // End anonymous namespace
-
-char SIAssignInterpRegsPass::ID = 0;
-
-#define INTERP_VALUES 16
-#define REQUIRED_VALUE_MAX_INDEX 7
-
-struct InterpInfo {
-  bool Enabled;
-  unsigned Regs[3];
-  unsigned RegCount;
-};
-
-
-FunctionPass *llvm::createSIAssignInterpRegsPass(TargetMachine &tm) {
-  return new SIAssignInterpRegsPass(tm);
-}
-
-bool SIAssignInterpRegsPass::runOnMachineFunction(MachineFunction &MF) {
-
-  struct InterpInfo InterpUse[INTERP_VALUES] = {
-    {false, {AMDGPU::PERSP_SAMPLE_I, AMDGPU::PERSP_SAMPLE_J}, 2},
-    {false, {AMDGPU::PERSP_CENTER_I, AMDGPU::PERSP_CENTER_J}, 2},
-    {false, {AMDGPU::PERSP_CENTROID_I, AMDGPU::PERSP_CENTROID_J}, 2},
-    {false, {AMDGPU::PERSP_I_W, AMDGPU::PERSP_J_W, AMDGPU::PERSP_1_W}, 3},
-    {false, {AMDGPU::LINEAR_SAMPLE_I, AMDGPU::LINEAR_SAMPLE_J}, 2},
-    {false, {AMDGPU::LINEAR_CENTER_I, AMDGPU::LINEAR_CENTER_J}, 2},
-    {false, {AMDGPU::LINEAR_CENTROID_I, AMDGPU::LINEAR_CENTROID_J}, 2},
-    {false, {AMDGPU::LINE_STIPPLE_TEX_COORD}, 1},
-    {false, {AMDGPU::POS_X_FLOAT}, 1},
-    {false, {AMDGPU::POS_Y_FLOAT}, 1},
-    {false, {AMDGPU::POS_Z_FLOAT}, 1},
-    {false, {AMDGPU::POS_W_FLOAT}, 1},
-    {false, {AMDGPU::FRONT_FACE}, 1},
-    {false, {AMDGPU::ANCILLARY}, 1},
-    {false, {AMDGPU::SAMPLE_COVERAGE}, 1},
-    {false, {AMDGPU::POS_FIXED_PT}, 1}
-  };
-
-  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
-  // This pass is only needed for pixel shaders.
-  if (MFI->ShaderType != ShaderType::PIXEL) {
-    return false;
-  }
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  bool ForceEnable = true;
-
-  // First pass, mark the interpolation values that are used.
-  for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
-    for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
-                                                               RegIdx++) {
-      InterpUse[InterpIdx].Enabled = InterpUse[InterpIdx].Enabled ||
-                            !MRI.use_empty(InterpUse[InterpIdx].Regs[RegIdx]);
-      if (InterpUse[InterpIdx].Enabled &&
-          InterpIdx <= REQUIRED_VALUE_MAX_INDEX) {
-        ForceEnable = false;
-      }
-    }
-  }
-
-  // At least one interpolation mode must be enabled or else the GPU will hang.
-  if (ForceEnable) {
-    InterpUse[0].Enabled = true;
-  }
-
-  unsigned UsedVgprs = 0;
-
-  // Second pass, replace with VGPRs.
-  for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
-    if (!InterpUse[InterpIdx].Enabled) {
-      continue;
-    }
-    MFI->SPIPSInputAddr |= (1 << InterpIdx);
-
-    for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
-                                                  RegIdx++, UsedVgprs++) {
-      unsigned NewReg = AMDGPU::VReg_32RegClass.getRegister(UsedVgprs);
-      unsigned VirtReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
-      MRI.replaceRegWith(InterpUse[InterpIdx].Regs[RegIdx], VirtReg);
-      addLiveIn(&MF, MRI, NewReg, VirtReg);
-    }
-  }
-
-  return false;
-}
-
-void SIAssignInterpRegsPass::addLiveIn(MachineFunction * MF,
-                           MachineRegisterInfo & MRI,
-                           unsigned physReg, unsigned virtReg) {
-    const TargetInstrInfo * TII = TM.getInstrInfo();
-    if (!MRI.isLiveIn(physReg)) {
-      MRI.addLiveIn(physReg, virtReg);
-      MF->front().addLiveIn(physReg);
-      BuildMI(MF->front(), MF->front().begin(), DebugLoc(),
-              TII->get(TargetOpcode::COPY), virtReg)
-                .addReg(physReg);
-    } else {
-      MRI.replaceRegWith(virtReg, MRI.getLiveInVirtReg(physReg));
-    }
-}
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 0a0fbd9..93f8c38 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -14,10 +14,13 @@
 
 #include "SIISelLowering.h"
 #include "AMDIL.h"
+#include "AMDGPU.h"
 #include "AMDILIntrinsicInfo.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -28,30 +31,41 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
     AMDGPUTargetLowering(TM),
     TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())),
     TRI(TM.getRegisterInfo()) {
-  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
-  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
-  addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
-  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
+
   addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
+  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
+
+  addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
+  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
+
+  addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
 
   addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass);
+
   addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+
   addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
+  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+
   addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
+  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+
   addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
+  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
 
   computeRegisterProperties();
 
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+
   setOperationAction(ISD::ADD, MVT::i64, Legal);
   setOperationAction(ISD::ADD, MVT::i32, Legal);
 
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
-
-  // We need to custom lower loads from the USER_SGPR address space, so we can
-  // add the SGPRs as livein registers.
-  setOperationAction(ISD::LOAD, MVT::i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::i64, Custom);
-
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 
@@ -59,6 +73,137 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setTargetDAGCombine(ISD::SELECT_CC);
 
   setTargetDAGCombine(ISD::SETCC);
+
+  setSchedulingPreference(Sched::Source);
+}
+
+SDValue SITargetLowering::LowerFormalArguments(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      DebugLoc DL, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  FunctionType *FType = MF.getFunction()->getFunctionType();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+  assert(CallConv == CallingConv::C);
+
+  SmallVector<ISD::InputArg, 16> Splits;
+  uint32_t Skipped = 0;
+
+  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
+    const ISD::InputArg &Arg = Ins[i];
+   
+    // First check if it's a PS input addr 
+    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) {
+
+      assert((PSInputNum <= 15) && "Too many PS inputs!");
+
+      if (!Arg.Used) {
+        // We can savely skip PS inputs
+        Skipped |= 1 << i;
+        ++PSInputNum;
+        continue;
+      }
+
+      Info->PSInputAddr |= 1 << PSInputNum++;
+    }
+
+    // Second split vertices into their elements
+    if (Arg.VT.isVector()) {
+      ISD::InputArg NewArg = Arg;
+      NewArg.Flags.setSplit();
+      NewArg.VT = Arg.VT.getVectorElementType();
+
+      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+      // three or five element vertex only needs three or five registers,
+      // NOT four or eigth.
+      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      for (unsigned j = 0; j != NumElements; ++j) {
+        Splits.push_back(NewArg);
+        NewArg.PartOffset += NewArg.VT.getStoreSize();
+      }
+
+    } else {
+      Splits.push_back(Arg);
+    }
+  }
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // At least one interpolation mode must be enabled or else the GPU will hang.
+  if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
+    Info->PSInputAddr |= 1;
+    CCInfo.AllocateReg(AMDGPU::VGPR0);
+    CCInfo.AllocateReg(AMDGPU::VGPR1);
+  }
+
+  AnalyzeFormalArguments(CCInfo, Splits);
+
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+
+    if (Skipped & (1 << i)) {
+      InVals.push_back(SDValue());
+      continue;
+    }
+
+    CCValAssign &VA = ArgLocs[ArgIdx++];
+    assert(VA.isRegLoc() && "Parameter must be in a register!");
+
+    unsigned Reg = VA.getLocReg();
+    MVT VT = VA.getLocVT();
+
+    if (VT == MVT::i64) {
+      // For now assume it is a pointer
+      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
+                                     &AMDGPU::SReg_64RegClass);
+      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
+      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+      continue;
+    }
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+
+    Reg = MF.addLiveIn(Reg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+
+    const ISD::InputArg &Arg = Ins[i];
+    if (Arg.VT.isVector()) {
+
+      // Build a vector from the registers
+      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      SmallVector<SDValue, 4> Regs;
+      Regs.push_back(Val);
+      for (unsigned j = 1; j != NumElements; ++j) {
+        Reg = ArgLocs[ArgIdx++].getLocReg();
+        Reg = MF.addLiveIn(Reg, RC);
+        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+      }
+
+      // Fill up the missing vector elements
+      NumElements = Arg.VT.getVectorNumElements() - NumElements;
+      for (unsigned j = 0; j != NumElements; ++j)
+        Regs.push_back(DAG.getUNDEF(VT));
+ 
+      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
+                                   Regs.data(), Regs.size()));
+      continue;
+    }
+
+    InVals.push_back(Val);
+  }
+  return Chain;
 }
 
 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
@@ -70,15 +215,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   case AMDGPU::BRANCH: return BB;
-  case AMDGPU::SHADER_TYPE:
-    BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType =
-                                        MI->getOperand(0).getImm();
-    MI->eraseFromParent();
-    break;
-
-  case AMDGPU::SI_INTERP:
-    LowerSI_INTERP(MI, *BB, I, MRI);
-    break;
   case AMDGPU::SI_WQM:
     LowerSI_WQM(MI, *BB, I, MRI);
     break;
@@ -94,41 +230,14 @@ void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
   MI->eraseFromParent();
 }
 
-void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
-    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
-  unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
-  unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
-  MachineOperand dst = MI->getOperand(0);
-  MachineOperand iReg = MI->getOperand(1);
-  MachineOperand jReg = MI->getOperand(2);
-  MachineOperand attr_chan = MI->getOperand(3);
-  MachineOperand attr = MI->getOperand(4);
-  MachineOperand params = MI->getOperand(5);
-
-  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
-          .addOperand(params);
-
-  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp)
-          .addOperand(iReg)
-          .addOperand(attr_chan)
-          .addOperand(attr)
-          .addReg(M0);
-
-  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32))
-          .addOperand(dst)
-          .addReg(tmp)
-          .addOperand(jReg)
-          .addOperand(attr_chan)
-          .addOperand(attr)
-          .addReg(M0);
-
-  MI->eraseFromParent();
-}
-
 EVT SITargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i1;
 }
 
+MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
+  return MVT::i32;
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//
@@ -137,20 +246,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
-  case ISD::LOAD: return LowerLOAD(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
-  case ISD::INTRINSIC_WO_CHAIN: {
-    unsigned IntrinsicID =
-                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-    EVT VT = Op.getValueType();
-    switch (IntrinsicID) {
-    case AMDGPUIntrinsic::SI_vs_load_buffer_index:
-      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
-                                  AMDGPU::VGPR0, VT);
-    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-    }
-    break;
-  }
   }
   return SDValue();
 }
@@ -249,47 +345,6 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   return Chain;
 }
 
-SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-  LoadSDNode *Ptr = dyn_cast<LoadSDNode>(Op);
-
-  assert(Ptr);
-
-  unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace();
-
-  // We only need to lower USER_SGPR address space loads
-  if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) {
-    return SDValue();
-  }
-
-  // Loads from the USER_SGPR address space can only have constant value
-  // pointers.
-  ConstantSDNode *BasePtr = dyn_cast<ConstantSDNode>(Ptr->getBasePtr());
-  assert(BasePtr);
-
-  unsigned TypeDwordWidth = VT.getSizeInBits() / 32;
-  const TargetRegisterClass * dstClass;
-  switch (TypeDwordWidth) {
-    default:
-      assert(!"USER_SGPR value size not implemented");
-      return SDValue();
-    case 1:
-      dstClass = &AMDGPU::SReg_32RegClass;
-      break;
-    case 2:
-      dstClass = &AMDGPU::SReg_64RegClass;
-      break;
-  }
-  uint64_t Index = BasePtr->getZExtValue();
-  assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned");
-  unsigned SGPRIndex = Index / TypeDwordWidth;
-  unsigned Reg = dstClass->getRegister(SGPRIndex);
-
-  DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg,
-                                                         VT));
-  return SDValue();
-}
-
 SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 737162f..d656225 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -24,14 +24,9 @@ class SITargetLowering : public AMDGPUTargetLowering {
   const SIInstrInfo * TII;
   const TargetRegisterInfo * TRI;
 
-  void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB,
-              MachineBasicBlock::iterator I, unsigned Opocde) const;
-  void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
-              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
   void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
               MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
 
-  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
@@ -43,9 +38,17 @@ class SITargetLowering : public AMDGPUTargetLowering {
 
 public:
   SITargetLowering(TargetMachine &tm);
+
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               DebugLoc DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const;
+
   virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
                                               MachineBasicBlock * BB) const;
   virtual EVT getSetCCResultType(EVT VT) const;
+  virtual MVT getScalarShiftAmountTy(EVT VT) const;
   virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
   virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 24fc929..98bd3db 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -88,6 +88,9 @@ private:
                   MachineBasicBlock::iterator I,
                   const Counters &Counts);
 
+  /// \brief Do we need def2def checks?
+  bool unorderedDefines(MachineInstr &MI);
+
   /// \brief Resolve all operand dependencies to counter requirements
   Counters handleOperands(MachineInstr &MI);
 
@@ -125,7 +128,7 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
 
   // Only consider stores or EXP for EXP_CNT
   Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
-      (MI.getOpcode() == AMDGPU::EXP || !MI.getDesc().mayStore()));
+      (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
 
   // LGKM may uses larger values
   if (TSFlags & SIInstrFlags::LGKM_CNT) {
@@ -311,8 +314,10 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
     RegInterval Interval = getRegInterval(Op);
     for (unsigned j = Interval.first; j < Interval.second; ++j) {
 
-      if (Op.isDef())
+      if (Op.isDef()) {
         increaseCounters(Result, UsedRegs[j]);
+        increaseCounters(Result, DefinedRegs[j]);
+      }
 
       if (Op.isUse())
         increaseCounters(Result, DefinedRegs[j]);
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index fe417d6..3891ddb 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -129,12 +129,12 @@ class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
             list<dag> pattern> : Enc32<outs, ins, asm, pattern> {
 
   bits<7> SDST;
-  bits<6> SBASE;
+  bits<7> SBASE;
   bits<8> OFFSET;
   
   let Inst{7-0} = OFFSET;
   let Inst{8} = imm;
-  let Inst{14-9} = SBASE;
+  let Inst{14-9} = SBASE{6-1};
   let Inst{21-15} = SDST;
   let Inst{26-22} = op;
   let Inst{31-27} = 0x18; //encoding
@@ -292,7 +292,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
   bits<1> ADDR64;
   bits<1> LDS;
   bits<8> VADDR;
-  bits<5> SRSRC;
+  bits<7> SRSRC;
   bits<1> SLC;
   bits<1> TFE;
   bits<8> SOFFSET;
@@ -307,7 +307,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let Inst{31-26} = 0x38; //encoding
   let Inst{39-32} = VADDR;
   let Inst{47-40} = VDATA;
-  let Inst{52-48} = SRSRC;
+  let Inst{52-48} = SRSRC{6-2};
   let Inst{54} = SLC;
   let Inst{55} = TFE;
   let Inst{63-56} = SOFFSET;
@@ -330,7 +330,7 @@ class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
   bits<4> DFMT;
   bits<3> NFMT;
   bits<8> VADDR;
-  bits<5> SRSRC;
+  bits<7> SRSRC;
   bits<1> SLC;
   bits<1> TFE;
   bits<8> SOFFSET;
@@ -346,7 +346,7 @@ class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let Inst{31-26} = 0x3a; //encoding
   let Inst{39-32} = VADDR;
   let Inst{47-40} = VDATA;
-  let Inst{52-48} = SRSRC;
+  let Inst{52-48} = SRSRC{6-2};
   let Inst{54} = SLC;
   let Inst{55} = TFE;
   let Inst{63-56} = SOFFSET;
@@ -370,8 +370,8 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
   bits<1> LWE;
   bits<1> SLC;
   bits<8> VADDR;
-  bits<5> SRSRC;
-  bits<5> SSAMP; 
+  bits<7> SRSRC;
+  bits<7> SSAMP; 
 
   let Inst{11-8} = DMASK;
   let Inst{12} = UNORM;
@@ -385,8 +385,8 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let Inst{31-26} = 0x3c;
   let Inst{39-32} = VADDR;
   let Inst{47-40} = VDATA;
-  let Inst{52-48} = SRSRC;
-  let Inst{57-53} = SSAMP;
+  let Inst{52-48} = SRSRC{6-2};
+  let Inst{57-53} = SSAMP{6-2};
 
   let VM_CNT = 1;
   let EXP_CNT = 1;
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index d9dbd6a..de2373b 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -33,36 +33,108 @@ const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const {
 
 void
 SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MI, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const {
+                         MachineBasicBlock::iterator MI, DebugLoc DL,
+                         unsigned DestReg, unsigned SrcReg,
+                         bool KillSrc) const {
+
   // If we are trying to copy to or from SCC, there is a bug somewhere else in
   // the backend.  While it may be theoretically possible to do this, it should
   // never be necessary.
   assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
 
-  if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
-    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
-	   AMDGPU::SReg_64RegClass.contains(SrcReg));
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub0))
-            .addReg(RI.getSubReg(SrcReg, AMDGPU::sub0), getKillRegState(KillSrc))
-            .addReg(DestReg, RegState::Define | RegState::Implicit);
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub1))
-            .addReg(RI.getSubReg(SrcReg, AMDGPU::sub1), getKillRegState(KillSrc));
+  const int16_t Sub0_15[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
+  };
+
+  const int16_t Sub0_7[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
+  };
+
+  const int16_t Sub0_3[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
+  };
+
+  const int16_t Sub0_1[] = {
+    AMDGPU::sub0, AMDGPU::sub1, 0
+  };
+
+  unsigned Opcode;
+  const int16_t *SubIndices;
+
+  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+
   } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
     assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
             .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+
+  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
+    Opcode = AMDGPU::S_MOV_B32;
+    SubIndices = Sub0_3;
+
+  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
+    Opcode = AMDGPU::S_MOV_B32;
+    SubIndices = Sub0_7;
+
+  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
+    Opcode = AMDGPU::S_MOV_B32;
+    SubIndices = Sub0_15;
+
   } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
     assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
-           AMDGPU::SReg_32RegClass.contains(SrcReg));
+	   AMDGPU::SReg_32RegClass.contains(SrcReg));
     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
             .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+
+  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
+	   AMDGPU::SReg_64RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_1;
+
+  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
+	   AMDGPU::SReg_128RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_3;
+
+  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
+	   AMDGPU::SReg_256RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_7;
+
+  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
+	   AMDGPU::SReg_512RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_15;
+
   } else {
-    assert(AMDGPU::SReg_32RegClass.contains(DestReg));
-    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
-    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
-            .addReg(SrcReg, getKillRegState(KillSrc));
+    llvm_unreachable("Can't copy register!");
+  }
+
+  while (unsigned SubIdx = *SubIndices++) {
+    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
+      get(Opcode), RI.getSubReg(DestReg, SubIdx));
+
+    Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));
+
+    if (*SubIndices)
+      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
   }
 }
 
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index d6c3f06..2f10c38 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -53,16 +53,6 @@ def SIOperand {
   int VCC = 0x6A;
 }
 
-class GPR4Align <RegisterClass rc> : Operand <vAny> {
-  let EncoderMethod = "GPR4AlignEncode";
-  let MIOperandInfo = (ops rc:$reg); 
-}
-
-class GPR2Align <RegisterClass rc> : Operand <iPTR> {
-  let EncoderMethod = "GPR2AlignEncode";
-  let MIOperandInfo = (ops rc:$reg);
-}
-
 include "SIInstrFormats.td"
 
 //===----------------------------------------------------------------------===//
@@ -125,16 +115,17 @@ class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK <
   opName#" $dst, $src0", pattern
 >;
 
-multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass dstClass> {
+multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass,
+                        RegisterClass dstClass> {
   def _IMM : SMRD <
     op, 1, (outs dstClass:$dst),
-    (ins GPR2Align<SReg_64>:$sbase, i32imm:$offset),
+    (ins baseClass:$sbase, i32imm:$offset),
     asm#" $dst, $sbase, $offset", []
   >;
 
   def _SGPR : SMRD <
     op, 0, (outs dstClass:$dst),
-    (ins GPR2Align<SReg_64>:$sbase, SReg_32:$soff),
+    (ins baseClass:$sbase, SReg_32:$soff),
     asm#" $dst, $sbase, $soff", []
   >;
 }
@@ -276,7 +267,7 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
   (outs),
   (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
    i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
-   GPR4Align<SReg_128>:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
+   SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
   asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
      #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
   []> {
@@ -288,7 +279,7 @@ class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF
   op,
   (outs regClass:$dst),
   (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
-       i1imm:$lds, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, i1imm:$slc,
+       i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc,
        i1imm:$tfe, SSrc_32:$soffset),
   asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, "
      #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset",
@@ -301,7 +292,7 @@ class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF
   op,
   (outs regClass:$dst),
   (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
-       i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc,
+       i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc,
        i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
   asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
      #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
@@ -315,7 +306,7 @@ class MIMG_Load_Helper <bits<7> op, string asm> : MIMG <
   (outs VReg_128:$vdata),
   (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
        i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_32:$vaddr,
-       GPR4Align<SReg_256>:$srsrc, GPR4Align<SReg_128>:$ssamp),
+       SReg_256:$srsrc, SReg_128:$ssamp),
   asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
      #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
   []> {
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index af116f0..05b04a9 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -403,9 +403,9 @@ def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT
 //def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
 //def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
 //def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
-//def BUFFER_LOAD_DWORD : MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>;
-//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>;
-//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>;
+def BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
+def BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
+def BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
 //def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
 //def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
 //def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>;
@@ -458,17 +458,31 @@ def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORM
 
 let mayLoad = 1 in {
 
-defm S_LOAD_DWORD : SMRD_Helper <0x00000000, "S_LOAD_DWORD", SReg_32>;
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SReg_32>;
+defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
 
-//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>;
-defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128>;
-defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256>;
-//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>;
-//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>;
-//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>;
-//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>;
-//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>;
-//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>;
+defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
+  0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SReg_32
+>;
+
+defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
+  0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64
+>;
+
+defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
+  0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128
+>;
+
+defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
+  0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256
+>;
+
+defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
+  0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
+>;
 
 } // mayLoad = 1
 
@@ -840,7 +854,9 @@ defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>;
 defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>;
 defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
 defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>;
-defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>;
+defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
+  [(set VReg_32:$dst, (shl VSrc_32:$src0, (i32 VReg_32:$src1)))]
+>;
 defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>;
 
 let isCommutable = 1 in {
@@ -1044,13 +1060,6 @@ def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
 
 let isCodeGenOnly = 1, isPseudo = 1 in {
 
-def SET_M0 : InstSI <
-  (outs SReg_32:$dst),
-  (ins i32imm:$src0),
-  "SET_M0 $dst, $src0",
-  [(set SReg_32:$dst, (int_SI_set_M0 imm:$src0))]
->;
-
 def LOAD_CONST : AMDGPUShaderInst <
   (outs GPRF32:$dst),
   (ins i32imm:$src),
@@ -1060,13 +1069,6 @@ def LOAD_CONST : AMDGPUShaderInst <
 
 let usesCustomInserter = 1 in {
 
-def SI_INTERP : InstSI <
-  (outs VReg_32:$dst),
-  (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
-  "SI_INTERP $dst, $i, $j, $attr_chan, $attr, $params",
-  []
->;
-
 def SI_WQM : InstSI <
   (outs),
   (ins),
@@ -1147,6 +1149,31 @@ def SI_KILL : InstSI <
 } // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
   // Uses = [EXEC], Defs = [EXEC]
 
+let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
+
+def SI_INDIRECT_SRC : InstSI <
+  (outs VReg_32:$dst, SReg_64:$temp),
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
+  "SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off",
+  []
+>;
+
+class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
+  (outs rc:$dst, SReg_64:$temp),
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val),
+  "SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val",
+  []
+> {
+  let Constraints = "$src = $dst";
+}
+
+def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
+def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
+def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
+def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
+
+} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
+
 } // end IsCodeGenOnly, isPseudo
 
 def : Pat<
@@ -1255,22 +1282,83 @@ defm : SamplePatterns<VReg_128, v4i32>;
 defm : SamplePatterns<VReg_256, v8i32>;
 defm : SamplePatterns<VReg_512, v16i32>;
 
-def : Extract_Element <f32, v4f32, VReg_128, 0, sub0>;
-def : Extract_Element <f32, v4f32, VReg_128, 1, sub1>;
-def : Extract_Element <f32, v4f32, VReg_128, 2, sub2>;
-def : Extract_Element <f32, v4f32, VReg_128, 3, sub3>;
+/********** ============================================ **********/
+/********** Extraction, Insertion, Building and Casting  **********/
+/********** ============================================ **********/
+
+foreach Index = 0-2 in {
+  def Extract_Element_v2i32_#Index : Extract_Element <
+    i32, v2i32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v2i32_#Index : Insert_Element <
+    i32, v2i32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v2f32_#Index : Extract_Element <
+    f32, v2f32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v2f32_#Index : Insert_Element <
+    f32, v2f32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-3 in {
+  def Extract_Element_v4i32_#Index : Extract_Element <
+    i32, v4i32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v4i32_#Index : Insert_Element <
+    i32, v4i32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+  >;
 
-def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sub0>;
-def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sub1>;
-def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sub2>;
-def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sub3>;
+  def Extract_Element_v4f32_#Index : Extract_Element <
+    f32, v4f32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v4f32_#Index : Insert_Element <
+    f32, v4f32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-7 in {
+  def Extract_Element_v8i32_#Index : Extract_Element <
+    i32, v8i32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v8i32_#Index : Insert_Element <
+    i32, v8i32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v8f32_#Index : Extract_Element <
+    f32, v8f32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v8f32_#Index : Insert_Element <
+    f32, v8f32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-15 in {
+  def Extract_Element_v16i32_#Index : Extract_Element <
+    i32, v16i32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v16i32_#Index : Insert_Element <
+    i32, v16i32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v16f32_#Index : Extract_Element <
+    f32, v16f32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v16f32_#Index : Insert_Element <
+    f32, v16f32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
 
 def : Vector1_Build <v1i32, VReg_32, i32, VReg_32>;
 def : Vector2_Build <v2i32, VReg_64, i32, VReg_32>;
-def : Vector_Build <v4f32, VReg_128, f32, VReg_32>;
-def : Vector_Build <v4i32, VReg_128, i32, VReg_32>;
+def : Vector2_Build <v2f32, VReg_64, f32, VReg_32>;
+def : Vector4_Build <v4i32, VReg_128, i32, VReg_32>;
+def : Vector4_Build <v4f32, VReg_128, f32, VReg_32>;
 def : Vector8_Build <v8i32, VReg_256, i32, VReg_32>;
+def : Vector8_Build <v8f32, VReg_256, f32, VReg_32>;
 def : Vector16_Build <v16i32, VReg_512, i32, VReg_32>;
+def : Vector16_Build <v16f32, VReg_512, f32, VReg_32>;
 
 def : BitConvert <i32, f32, SReg_32>;
 def : BitConvert <i32, f32, VReg_32>;
@@ -1305,11 +1393,6 @@ def : Pat <
 /********** ================== **********/
 
 def : Pat <
-  (i1 imm:$imm),
-  (S_MOV_B64 imm:$imm)
->;
-
-def : Pat <
   (i32 imm:$imm),
   (V_MOV_B32_e32 imm:$imm)
 >;
@@ -1320,13 +1403,8 @@ def : Pat <
 >;
 
 def : Pat <
-  (i32 imm:$imm),
-  (S_MOV_B32 imm:$imm)
->;
-
-def : Pat <
-  (f32 fpimm:$imm),
-  (S_MOV_B32 fpimm:$imm)
+  (i1 imm:$imm),
+  (S_MOV_B64 imm:$imm)
 >;
 
 def : Pat <
@@ -1347,58 +1425,16 @@ def : Pat <
 /********** ===================== **********/
 
 def : Pat <
-  (int_SI_fs_interp_constant imm:$attr_chan, imm:$attr, SReg_32:$params),
-  (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr,
-                    (S_MOV_B32 SReg_32:$params))
->;
-
-def : Pat <
-  (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params),
-  (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan,
-             imm:$attr, SReg_32:$params)
+  (int_SI_fs_constant imm:$attr_chan, imm:$attr, M0Reg:$params),
+  (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, M0Reg:$params)
 >;
 
 def : Pat <
-  (int_SI_fs_interp_linear_centroid imm:$attr_chan, imm:$attr, SReg_32:$params),
-  (SI_INTERP (f32 LINEAR_CENTROID_I), (f32 LINEAR_CENTROID_J), imm:$attr_chan,
-             imm:$attr, SReg_32:$params)
->;
-
-def : Pat <
-  (int_SI_fs_interp_persp_center imm:$attr_chan, imm:$attr, SReg_32:$params),
-  (SI_INTERP (f32 PERSP_CENTER_I), (f32 PERSP_CENTER_J), imm:$attr_chan,
-             imm:$attr, SReg_32:$params)
->;
-
-def : Pat <
-  (int_SI_fs_interp_persp_centroid imm:$attr_chan, imm:$attr, SReg_32:$params),
-  (SI_INTERP (f32 PERSP_CENTROID_I), (f32 PERSP_CENTROID_J), imm:$attr_chan,
-             imm:$attr, SReg_32:$params)
->;
-
-def : Pat <
-  (int_SI_fs_read_face),
-  (f32 FRONT_FACE)
->;
-
-def : Pat <
-  (int_SI_fs_read_pos 0),
-  (f32 POS_X_FLOAT)
->;
-
-def : Pat <
-  (int_SI_fs_read_pos 1),
-  (f32 POS_Y_FLOAT)
->;
-
-def : Pat <
-  (int_SI_fs_read_pos 2),
-  (f32 POS_Z_FLOAT)
->;
-
-def : Pat <
-  (int_SI_fs_read_pos 3),
-  (f32 POS_W_FLOAT)
+  (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, VReg_64:$ij),
+  (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG VReg_64:$ij, sub0),
+                                    imm:$attr_chan, imm:$attr, M0Reg:$params),
+                   (EXTRACT_SUBREG VReg_64:$ij, sub1),
+                   imm:$attr_chan, imm:$attr, M0Reg:$params)
 >;
 
 /********** ================== **********/
@@ -1455,6 +1491,24 @@ def : Pat <
   (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0)
 >;
 
+// 1. Offset as 8bit DWORD immediate
+def : Pat <
+  (int_SI_load_const SReg_128:$sbase, IMM8bitDWORD:$offset),
+  (S_BUFFER_LOAD_DWORD_IMM SReg_128:$sbase, IMM8bitDWORD:$offset)
+>;
+
+// 2. Offset loaded in an 32bit SGPR
+def : Pat <
+  (int_SI_load_const SReg_128:$sbase, imm:$offset),
+  (S_BUFFER_LOAD_DWORD_SGPR SReg_128:$sbase, (S_MOV_B32 imm:$offset))
+>;
+
+// 3. Offset in an 32Bit VGPR
+def : Pat <
+  (int_SI_load_const SReg_128:$sbase, VReg_32:$voff),
+  (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, VReg_32:$voff, SReg_128:$sbase, 0, 0, 0)
+>;
+
 /********** ================== **********/
 /**********   VOP3 Patterns    **********/
 /********** ================== **********/
@@ -1489,7 +1543,51 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
 
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+
+/********** ====================== **********/
+/**********   Indirect adressing   **********/
+/********** ====================== **********/
+
+multiclass SI_INDIRECT_Pattern <RegisterClass rc, ValueType vt,
+                                SI_INDIRECT_DST IndDst> {
+  // 1. Extract with offset
+  def : Pat<
+    (vector_extract (vt rc:$vec),
+      (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
+    ),
+    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off))
+  >;
+
+  // 2. Extract without offset
+  def : Pat<
+    (vector_extract (vt rc:$vec),
+      (i64 (zext (i32 VReg_32:$idx)))
+    ),
+    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0))
+  >;
+
+  // 3. Insert with offset
+  def : Pat<
+    (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
+      (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
+    ),
+    (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off, VReg_32:$val))
+  >;
+
+  // 4. Insert without offset
+  def : Pat<
+    (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
+      (i64 (zext (i32 VReg_32:$idx)))
+    ),
+    (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0, VReg_32:$val))
+  >;
+}
+
+defm : SI_INDIRECT_Pattern <VReg_64, v2f32, SI_INDIRECT_DST_V2>;
+defm : SI_INDIRECT_Pattern <VReg_128, v4f32, SI_INDIRECT_DST_V4>;
+defm : SI_INDIRECT_Pattern <VReg_256, v8f32, SI_INDIRECT_DST_V8>;
+defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>;
 
 } // End isSI predicate
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 611b9c4..33bb815 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -16,13 +16,11 @@ let TargetPrefix = "SI", isTarget = 1 in {
 
   def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
   def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
-  /* XXX: We may need a seperate intrinsic here for loading integer values */
-  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>;
-  def int_SI_vs_load_buffer_index : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
-  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ;
+  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrReadMem]>;
+  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ;
   def int_SI_wqm : Intrinsic <[], [], []>;
 
-  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v8i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrReadMem]>;
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrReadMem]>;
 
   def int_SI_sample : Sample;
   def int_SI_sampleb : Sample;
@@ -30,17 +28,8 @@ let TargetPrefix = "SI", isTarget = 1 in {
 
   /* Interpolation Intrinsics */
 
-  def int_SI_set_M0 : Intrinsic <[llvm_i32_ty], [llvm_i32_ty]>;
-  class Interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>;
-
-  def int_SI_fs_interp_linear_center : Interp;
-  def int_SI_fs_interp_linear_centroid : Interp;
-  def int_SI_fs_interp_persp_center : Interp;
-  def int_SI_fs_interp_persp_centroid : Interp;
-  def int_SI_fs_interp_constant : Interp;
-
-  def int_SI_fs_read_face : Intrinsic <[llvm_float_ty], [], [IntrNoMem]>;
-  def int_SI_fs_read_pos : Intrinsic <[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>;
+  def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrReadMem]>;
 
   /* Control flow Intrinsics */
 
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index b215aa2..9a027e7 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -66,6 +66,7 @@ private:
   static const unsigned SkipThreshold = 12;
 
   static char ID;
+  const TargetRegisterInfo *TRI;
   const TargetInstrInfo *TII;
 
   bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
@@ -84,9 +85,14 @@ private:
   void Kill(MachineInstr &MI);
   void Branch(MachineInstr &MI);
 
+  void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
+  void IndirectSrc(MachineInstr &MI);
+  void IndirectDst(MachineInstr &MI);
+
 public:
   SILowerControlFlowPass(TargetMachine &tm) :
-    MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
+    MachineFunctionPass(ID), TRI(tm.getRegisterInfo()),
+    TII(tm.getInstrInfo()) { }
 
   virtual bool runOnMachineFunction(MachineFunction &MF);
 
@@ -302,6 +308,104 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I = MI;
+
+  unsigned Save = MI.getOperand(1).getReg();
+  unsigned Idx = MI.getOperand(3).getReg();
+
+  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+            .addReg(Idx);
+    MBB.insert(I, MovRel);
+    MI.eraseFromParent();
+    return;
+  }
+
+  assert(AMDGPU::SReg_64RegClass.contains(Save));
+  assert(AMDGPU::VReg_32RegClass.contains(Idx));
+
+  // Save the EXEC mask
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+          .addReg(AMDGPU::EXEC);
+
+  // Read the next variant into VCC (lower 32 bits) <- also loop target
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32_e32), AMDGPU::VCC)
+          .addReg(Idx);
+
+  // Move index from VCC into M0
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+          .addReg(AMDGPU::VCC);
+
+  // Compare the just read M0 value to all possible Idx values
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
+          .addReg(AMDGPU::M0)
+          .addReg(Idx);
+
+  // Update EXEC, save the original EXEC value to VCC
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+          .addReg(AMDGPU::VCC);
+
+  // Do the actual move
+  MBB.insert(I, MovRel);
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(AMDGPU::VCC);
+
+  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+          .addImm(-7)
+          .addReg(AMDGPU::EXEC);
+
+  // Restore EXEC
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+          .addReg(Save);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Vec = MI.getOperand(2).getReg();
+  unsigned Off = MI.getOperand(4).getImm();
+
+  MachineInstr *MovRel = 
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+            .addReg(TRI->getSubReg(Vec, AMDGPU::sub0) + Off)
+            .addReg(AMDGPU::M0, RegState::Implicit)
+            .addReg(Vec, RegState::Implicit);
+
+  LoadM0(MI, MovRel);
+}
+
+void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Off = MI.getOperand(4).getImm();
+  unsigned Val = MI.getOperand(5).getReg();
+
+  MachineInstr *MovRel = 
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
+            .addReg(TRI->getSubReg(Dst, AMDGPU::sub0) + Off, RegState::Define)
+            .addReg(Val)
+            .addReg(AMDGPU::M0, RegState::Implicit)
+            .addReg(Dst, RegState::Implicit);
+
+  LoadM0(MI, MovRel);
+}
+
 bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
 
   bool HaveKill = false;
@@ -363,6 +467,17 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
         case AMDGPU::S_BRANCH:
           Branch(MI);
           break;
+
+        case AMDGPU::SI_INDIRECT_SRC:
+          IndirectSrc(MI);
+          break;
+
+        case AMDGPU::SI_INDIRECT_DST_V2:
+        case AMDGPU::SI_INDIRECT_DST_V4:
+        case AMDGPU::SI_INDIRECT_DST_V8:
+        case AMDGPU::SI_INDIRECT_DST_V16:
+          IndirectDst(MI);
+          break;
       }
     }
   }
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index 7e59b42..1a4e4cb 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -10,11 +10,25 @@
 
 
 #include "SIMachineFunctionInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
 
 using namespace llvm;
 
+const char *SIMachineFunctionInfo::ShaderTypeAttribute = "ShaderType";
+
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : MachineFunctionInfo(),
-    SPIPSInputAddr(0),
-    ShaderType(0)
-  { }
+    ShaderType(0),
+    PSInputAddr(0) {
+
+  AttributeSet Set = MF.getFunction()->getAttributes();
+  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
+                                 ShaderTypeAttribute);
+
+  if (A.isStringAttribute()) {
+    StringRef Str = A.getValueAsString();
+    if (Str.getAsInteger(0, ShaderType))
+      llvm_unreachable("Can't parse shader type!");
+  }
+}
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 47271f5..91a809b 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -23,9 +23,11 @@ namespace llvm {
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo : public MachineFunctionInfo {
 public:
+  static const char *ShaderTypeAttribute;
+
   SIMachineFunctionInfo(const MachineFunction &MF);
-  unsigned SPIPSInputAddr;
   unsigned ShaderType;
+  unsigned PSInputAddr;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 9e04e24..4f14931 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -34,32 +34,6 @@ foreach Index = 0-255 in {
   }
 }
 
-// virtual Interpolation registers
-def PERSP_SAMPLE_I : SIReg <"PERSP_SAMPLE_I">;
-def PERSP_SAMPLE_J : SIReg <"PERSP_SAMPLE_J">;
-def PERSP_CENTER_I : SIReg <"PERSP_CENTER_I">;
-def PERSP_CENTER_J : SIReg <"PERSP_CENTER_J">;
-def PERSP_CENTROID_I : SIReg <"PERSP_CENTROID_I">;
-def PERSP_CENTROID_J : SIReg <"PERP_CENTROID_J">;
-def PERSP_I_W : SIReg <"PERSP_I_W">;
-def PERSP_J_W : SIReg <"PERSP_J_W">;
-def PERSP_1_W : SIReg <"PERSP_1_W">;
-def LINEAR_SAMPLE_I : SIReg <"LINEAR_SAMPLE_I">;
-def LINEAR_SAMPLE_J : SIReg <"LINEAR_SAMPLE_J">;
-def LINEAR_CENTER_I : SIReg <"LINEAR_CENTER_I">;
-def LINEAR_CENTER_J : SIReg <"LINEAR_CENTER_J">;
-def LINEAR_CENTROID_I : SIReg <"LINEAR_CENTROID_I">;
-def LINEAR_CENTROID_J : SIReg <"LINEAR_CENTROID_J">;
-def LINE_STIPPLE_TEX_COORD : SIReg <"LINE_STIPPLE_TEX_COORD">;
-def POS_X_FLOAT : SIReg <"POS_X_FLOAT">;
-def POS_Y_FLOAT : SIReg <"POS_Y_FLOAT">;
-def POS_Z_FLOAT : SIReg <"POS_Z_FLOAT">;
-def POS_W_FLOAT : SIReg <"POS_W_FLOAT">;
-def FRONT_FACE : SIReg <"FRONT_FACE">;
-def ANCILLARY : SIReg <"ANCILLARY">;
-def SAMPLE_COVERAGE : SIReg <"SAMPLE_COVERAGE">;
-def POS_FIXED_PT : SIReg <"POS_FIXED_PT">;
-
 //===----------------------------------------------------------------------===//
 //  Groupings using register classes and tuples
 //===----------------------------------------------------------------------===//
@@ -177,22 +151,22 @@ def SReg_64 : RegisterClass<"AMDGPU", [i64, i1], 64,
   (add SGPR_64, VCCReg, EXECReg)
 >;
 
-def SReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v16i8], 128, (add SGPR_128)>;
 
-def SReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add SGPR_256)>;
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>;
 
-def SReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add SGPR_512)>;
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8], 512, (add SGPR_512)>;
 
 // Register class for all vector registers (VGPRs + Interploation Registers)
-def VReg_32 : RegisterClass<"AMDGPU", [f32, i32, v1i32], 32, (add VGPR_32)>;
+def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
 
-def VReg_64 : RegisterClass<"AMDGPU", [i64, v2i32], 64, (add VGPR_64)>;
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
 
-def VReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
 
-def VReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add VGPR_256)>;
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 256, (add VGPR_256)>;
 
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>;
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
 
 //===----------------------------------------------------------------------===//
 //  [SV]Src_* register classes, can have either an immediate or an register
@@ -200,28 +174,9 @@ def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>;
 
 def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
 
-def SSrc_64 : RegisterClass<"AMDGPU", [i64, i1], 64, (add SReg_64)>;
-
-def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
-  (add VReg_32, SReg_32,
-    PERSP_SAMPLE_I, PERSP_SAMPLE_J,
-    PERSP_CENTER_I, PERSP_CENTER_J,
-    PERSP_CENTROID_I, PERSP_CENTROID_J,
-    PERSP_I_W, PERSP_J_W, PERSP_1_W,
-    LINEAR_SAMPLE_I, LINEAR_SAMPLE_J,
-    LINEAR_CENTER_I, LINEAR_CENTER_J,
-    LINEAR_CENTROID_I, LINEAR_CENTROID_J,
-    LINE_STIPPLE_TEX_COORD,
-    POS_X_FLOAT,
-    POS_Y_FLOAT,
-    POS_Z_FLOAT,
-    POS_W_FLOAT,
-    FRONT_FACE,
-    ANCILLARY,
-    SAMPLE_COVERAGE,
-    POS_FIXED_PT
-  )
->;
+def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>;
+
+def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
 
-def VSrc_64 : RegisterClass<"AMDGPU", [i64], 64, (add VReg_64, SReg_64)>;
+def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
 
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 138b92d..28ac02a 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -955,9 +955,7 @@ static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
   // Get the condition flag.
   SDValue CompareFlag;
   if (LHS.getValueType() == MVT::i32) {
-    std::vector<EVT> VTs;
-    VTs.push_back(MVT::i32);
-    VTs.push_back(MVT::Glue);
+    EVT VTs[] = { MVT::i32, MVT::Glue };
     SDValue Ops[2] = { LHS, RHS };
     CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
     if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
@@ -986,9 +984,8 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
 
   SDValue CompareFlag;
   if (LHS.getValueType() == MVT::i32) {
-    std::vector<EVT> VTs;
-    VTs.push_back(LHS.getValueType());   // subcc returns a value
-    VTs.push_back(MVT::Glue);
+    // subcc returns a value
+    EVT VTs[] = { LHS.getValueType(), MVT::Glue };
     SDValue Ops[2] = { LHS, RHS };
     CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
     Opc = SPISD::SELECT_ICC;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index 357879b..b53a1ed 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -40,7 +40,8 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
                            int SPAdj, unsigned FIOperandNum,
                            RegScavenger *RS = NULL) const;
 
-  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                       RegScavenger *RS = NULL) const;
 
   // Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const;
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 3a9ace4..ee88ce7 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -24,6 +24,8 @@ void TargetLibraryInfo::anchor() { }
 
 const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
   {
+    "_IO_getc",
+    "_IO_putc",
     "_ZdaPv",
     "_ZdlPv",
     "_Znaj",
@@ -38,8 +40,14 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "__cxa_guard_abort",
     "__cxa_guard_acquire",
     "__cxa_guard_release",
+    "__isoc99_scanf",
+    "__isoc99_sscanf",
     "__memcpy_chk",
+    "__strdup",
+    "__strndup",
+    "__strtok_r",
     "abs",
+    "access",
     "acos",
     "acosf",
     "acosh",
@@ -61,6 +69,13 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "atanhf",
     "atanhl",
     "atanl",
+    "atof",
+    "atoi",
+    "atol",
+    "atoll",
+    "bcmp",
+    "bcopy",
+    "bzero",
     "calloc",
     "cbrt",
     "cbrtf",
@@ -68,6 +83,10 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "ceil",
     "ceilf",
     "ceill",
+    "chmod",
+    "chown",
+    "clearerr",
+    "closedir",
     "copysign",
     "copysignf",
     "copysignl",
@@ -77,6 +96,7 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "coshf",
     "coshl",
     "cosl",
+    "ctermid",
     "exp",
     "exp10",
     "exp10f",
@@ -92,25 +112,66 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "fabs",
     "fabsf",
     "fabsl",
+    "fclose",
+    "fdopen",
+    "feof",
+    "ferror",
+    "fflush",
     "ffs",
     "ffsl",
     "ffsll",
+    "fgetc",
+    "fgetpos",
+    "fgets",
+    "fileno",
     "fiprintf",
+    "flockfile",
     "floor",
     "floorf",
     "floorl",
     "fmod",
     "fmodf",
     "fmodl",
+    "fopen",
+    "fopen64",
     "fprintf",
     "fputc",
     "fputs",
+    "fread",
     "free",
+    "frexp",
+    "frexpf",
+    "frexpl",
+    "fscanf",
+    "fseek",
+    "fseeko",
+    "fseeko64",
+    "fsetpos",
+    "fstat",
+    "fstat64",
+    "fstatvfs",
+    "fstatvfs64",
+    "ftell",
+    "ftello",
+    "ftello64",
+    "ftrylockfile",
+    "funlockfile",
     "fwrite",
+    "getc",
+    "getc_unlocked",
+    "getchar",
+    "getenv",
+    "getitimer",
+    "getlogin_r",
+    "getpwnam",
+    "gets",
+    "htonl",
+    "htons",
     "iprintf",
     "isascii",
     "isdigit",
     "labs",
+    "lchown",
     "llabs",
     "log",
     "log10",
@@ -127,31 +188,64 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "logbl",
     "logf",
     "logl",
+    "lstat",
+    "lstat64",
     "malloc",
+    "memalign",
+    "memccpy",
     "memchr",
     "memcmp",
     "memcpy",
     "memmove",
+    "memrchr",
     "memset",
     "memset_pattern16",
+    "mkdir",
+    "mktime",
+    "modf",
+    "modff",
+    "modfl",
     "nearbyint",
     "nearbyintf",
     "nearbyintl",
+    "ntohl",
+    "ntohs",
+    "open",
+    "open64",
+    "opendir",
+    "pclose",
+    "perror",
+    "popen",
     "posix_memalign",
     "pow",
     "powf",
     "powl",
+    "pread",
     "printf",
+    "putc",
     "putchar",
     "puts",
+    "pwrite",
+    "qsort",
+    "read",
+    "readlink",
     "realloc",
     "reallocf",
+    "realpath",
+    "remove",
+    "rename",
+    "rewind",
     "rint",
     "rintf",
     "rintl",
+    "rmdir",
     "round",
     "roundf",
     "roundl",
+    "scanf",
+    "setbuf",
+    "setitimer",
+    "setvbuf",
     "sin",
     "sinf",
     "sinh",
@@ -159,18 +253,28 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "sinhl",
     "sinl",
     "siprintf",
+    "snprintf",
     "sprintf",
     "sqrt",
     "sqrtf",
     "sqrtl",
+    "sscanf",
+    "stat",
+    "stat64",
+    "statvfs",
+    "statvfs64",
     "stpcpy",
+    "stpncpy",
+    "strcasecmp",
     "strcat",
     "strchr",
     "strcmp",
+    "strcoll",
     "strcpy",
     "strcspn",
     "strdup",
     "strlen",
+    "strncasecmp",
     "strncat",
     "strncmp",
     "strncpy",
@@ -182,22 +286,43 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "strstr",
     "strtod",
     "strtof",
+    "strtok",
+    "strtok_r",
     "strtol",
     "strtold",
     "strtoll",
     "strtoul",
     "strtoull",
+    "strxfrm",
+    "system",
     "tan",
     "tanf",
     "tanh",
     "tanhf",
     "tanhl",
     "tanl",
+    "times",
+    "tmpfile",
+    "tmpfile64",
     "toascii",
     "trunc",
     "truncf",
     "truncl",
-    "valloc"
+    "uname",
+    "ungetc",
+    "unlink",
+    "unsetenv",
+    "utime",
+    "utimes",
+    "valloc",
+    "vfprintf",
+    "vfscanf",
+    "vprintf",
+    "vscanf",
+    "vsnprintf",
+    "vsprintf",
+    "vsscanf",
+    "write"
   };
 
 /// initialize - Initialize the set of available library functions based on the
@@ -259,7 +384,9 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc::fabsl);
     TLI.setUnavailable(LibFunc::floorl);
     TLI.setUnavailable(LibFunc::fmodl);
+    TLI.setUnavailable(LibFunc::frexpl);
     TLI.setUnavailable(LibFunc::logl);
+    TLI.setUnavailable(LibFunc::modfl);
     TLI.setUnavailable(LibFunc::powl);
     TLI.setUnavailable(LibFunc::sinl);
     TLI.setUnavailable(LibFunc::sinhl);
@@ -336,16 +463,67 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
       TLI.setUnavailable(LibFunc::tanhf);
     }
 
-    // Win32 does *not* provide stpcpy.  It is provided on POSIX systems:
-    // http://pubs.opengroup.org/onlinepubs/9699919799/functions/stpcpy.html
-    TLI.setUnavailable(LibFunc::stpcpy);
-
-    // Win32 does *not* provide ffs.  It is provided on POSIX systems:
-    // http://pubs.opengroup.org/onlinepubs/009695399/functions/ffs.html
+    // Win32 does *not* provide provide these functions, but they are
+    // generally available on POSIX-compliant systems:
+    TLI.setUnavailable(LibFunc::access);
+    TLI.setUnavailable(LibFunc::bcmp);
+    TLI.setUnavailable(LibFunc::bcopy);
+    TLI.setUnavailable(LibFunc::bzero);
+    TLI.setUnavailable(LibFunc::chmod);
+    TLI.setUnavailable(LibFunc::chown);
+    TLI.setUnavailable(LibFunc::closedir);
+    TLI.setUnavailable(LibFunc::ctermid);
+    TLI.setUnavailable(LibFunc::fdopen);
     TLI.setUnavailable(LibFunc::ffs);
+    TLI.setUnavailable(LibFunc::fileno);
+    TLI.setUnavailable(LibFunc::flockfile);
+    TLI.setUnavailable(LibFunc::fseeko);
+    TLI.setUnavailable(LibFunc::fstat);
+    TLI.setUnavailable(LibFunc::fstatvfs);
+    TLI.setUnavailable(LibFunc::ftello);
+    TLI.setUnavailable(LibFunc::ftrylockfile);
+    TLI.setUnavailable(LibFunc::funlockfile);
+    TLI.setUnavailable(LibFunc::getc_unlocked);
+    TLI.setUnavailable(LibFunc::getitimer);
+    TLI.setUnavailable(LibFunc::getlogin_r);
+    TLI.setUnavailable(LibFunc::getpwnam);
+    TLI.setUnavailable(LibFunc::htonl);
+    TLI.setUnavailable(LibFunc::htons);
+    TLI.setUnavailable(LibFunc::lchown);
+    TLI.setUnavailable(LibFunc::lstat);
+    TLI.setUnavailable(LibFunc::memccpy);
+    TLI.setUnavailable(LibFunc::mkdir);
+    TLI.setUnavailable(LibFunc::ntohl);
+    TLI.setUnavailable(LibFunc::ntohs);
+    TLI.setUnavailable(LibFunc::open);
+    TLI.setUnavailable(LibFunc::opendir);
+    TLI.setUnavailable(LibFunc::pclose);
+    TLI.setUnavailable(LibFunc::popen);
+    TLI.setUnavailable(LibFunc::pread);
+    TLI.setUnavailable(LibFunc::pwrite);
+    TLI.setUnavailable(LibFunc::read);
+    TLI.setUnavailable(LibFunc::readlink);
+    TLI.setUnavailable(LibFunc::realpath);
+    TLI.setUnavailable(LibFunc::rmdir);
+    TLI.setUnavailable(LibFunc::setitimer);
+    TLI.setUnavailable(LibFunc::stat);
+    TLI.setUnavailable(LibFunc::statvfs);
+    TLI.setUnavailable(LibFunc::stpcpy);
+    TLI.setUnavailable(LibFunc::stpncpy);
+    TLI.setUnavailable(LibFunc::strcasecmp);
+    TLI.setUnavailable(LibFunc::strncasecmp);
+    TLI.setUnavailable(LibFunc::times);
+    TLI.setUnavailable(LibFunc::uname);
+    TLI.setUnavailable(LibFunc::unlink);
+    TLI.setUnavailable(LibFunc::unsetenv);
+    TLI.setUnavailable(LibFunc::utime);
+    TLI.setUnavailable(LibFunc::utimes);
+    TLI.setUnavailable(LibFunc::write);
 
-    // Win32 does *not* provide llabs.  It is defined in ISO/IEC 9899:1999,
-    // but Visual C++ does not support it.
+    // Win32 does *not* provide provide these functions, but they are
+    // specified by C99:
+    TLI.setUnavailable(LibFunc::atoll);
+    TLI.setUnavailable(LibFunc::frexpf);
     TLI.setUnavailable(LibFunc::llabs);
   }
 
@@ -375,6 +553,27 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
   default:
     TLI.setUnavailable(LibFunc::ffsll);
   }
+
+  // The following functions are available on at least Linux:
+  if (T.getOS() != Triple::Linux) {
+    TLI.setUnavailable(LibFunc::dunder_strdup);
+    TLI.setUnavailable(LibFunc::dunder_strtok_r);
+    TLI.setUnavailable(LibFunc::dunder_isoc99_scanf);
+    TLI.setUnavailable(LibFunc::dunder_isoc99_sscanf);
+    TLI.setUnavailable(LibFunc::under_IO_getc);
+    TLI.setUnavailable(LibFunc::under_IO_putc);
+    TLI.setUnavailable(LibFunc::memalign);
+    TLI.setUnavailable(LibFunc::fopen64);
+    TLI.setUnavailable(LibFunc::fseeko64);
+    TLI.setUnavailable(LibFunc::fstat64);
+    TLI.setUnavailable(LibFunc::fstatvfs64);
+    TLI.setUnavailable(LibFunc::ftello64);
+    TLI.setUnavailable(LibFunc::lstat64);
+    TLI.setUnavailable(LibFunc::open64);
+    TLI.setUnavailable(LibFunc::stat64);
+    TLI.setUnavailable(LibFunc::statvfs64);
+    TLI.setUnavailable(LibFunc::tmpfile64);
+  }
 }
 
 
@@ -398,11 +597,40 @@ TargetLibraryInfo::TargetLibraryInfo(const TargetLibraryInfo &TLI)
   CustomNames = TLI.CustomNames;
 }
 
+namespace {
+struct StringComparator {
+  /// Compare two strings and return true if LHS is lexicographically less than
+  /// RHS. Requires that RHS doesn't contain any zero bytes.
+  bool operator()(const char *LHS, StringRef RHS) const {
+    // Compare prefixes with strncmp. If prefixes match we know that LHS is
+    // greater or equal to RHS as RHS can't contain any '\0'.
+    return std::strncmp(LHS, RHS.data(), RHS.size()) < 0;
+  }
+
+  // Provided for compatibility with MSVC's debug mode.
+  bool operator()(StringRef LHS, const char *RHS) const { return LHS < RHS; }
+  bool operator()(StringRef LHS, StringRef RHS) const { return LHS < RHS; }
+  bool operator()(const char *LHS, const char *RHS) const {
+    return std::strcmp(LHS, RHS) < 0;
+  }
+};
+}
+
 bool TargetLibraryInfo::getLibFunc(StringRef funcName,
                                    LibFunc::Func &F) const {
   const char **Start = &StandardNames[0];
   const char **End = &StandardNames[LibFunc::NumLibFuncs];
-  const char **I = std::lower_bound(Start, End, funcName);
+
+  // Filter out empty names and names containing null bytes, those can't be in
+  // our table.
+  if (funcName.empty() || funcName.find('\0') != StringRef::npos)
+    return false;
+
+  // Check for \01 prefix that is used to mangle __asm declarations and
+  // strip it if present.
+  if (funcName.front() == '\01')
+    funcName = funcName.substr(1);
+  const char **I = std::lower_bound(Start, End, funcName, StringComparator());
   if (I != End && *I == funcName) {
     F = (LibFunc::Func)(I - Start);
     return true;
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 7d8b49c..e728251 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -12,6 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -61,6 +63,30 @@ TargetMachine::~TargetMachine() {
   delete AsmInfo;
 }
 
+/// \brief Reset the target options based on the function's attributes.
+void TargetMachine::resetTargetOptions(const MachineFunction *MF) const {
+  const Function *F = MF->getFunction();
+  TargetOptions &TO = MF->getTarget().Options;
+  
+#define RESET_OPTION(X, Y)                                              \
+  do {                                                                  \
+    if (F->hasFnAttribute(Y))                                           \
+      TO.X =                                                            \
+        (F->getAttributes().                                            \
+           getAttribute(AttributeSet::FunctionIndex,                    \
+                        Y).getValueAsString() == "true");               \
+  } while (0)
+
+  RESET_OPTION(NoFramePointerElim, "no-frame-pointer-elim");
+  RESET_OPTION(NoFramePointerElimNonLeaf, "no-frame-pointer-elim-non-leaf");
+  RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad");
+  RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
+  RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
+  RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
+  RESET_OPTION(UseSoftFloat, "use-soft-float");
+  RESET_OPTION(DisableTailCalls, "disable-tail-calls");
+}
+
 /// getRelocationModel - Returns the code generation relocation model. The
 /// choices are static, PIC, and dynamic-no-pic, and target default.
 Reloc::Model TargetMachine::getRelocationModel() const {
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index 1e4c195..79f74bd 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -184,7 +184,7 @@ LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
   }
 
   if (TM->addPassesToEmitFile(pass, destf, ft)) {
-    error = "No DataLayout in TargetMachine";
+    error = "TargetMachine can't emit a file of this type";
     *ErrorMessage = strdup(error.c_str());
     return true;
   }
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b2c6d55..4ed5534a6 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -170,30 +170,35 @@ struct X86Operand : public MCParsedAsmOperand {
   SMLoc OffsetOfLoc;
   bool AddressOf;
 
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNo;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+    bool NeedAsmRewrite;
+  };
+
+  struct MemOp {
+    unsigned SegReg;
+    const MCExpr *Disp;
+    unsigned BaseReg;
+    unsigned IndexReg;
+    unsigned Scale;
+    unsigned Size;
+    bool NeedSizeDir;
+  };
+
   union {
-    struct {
-      const char *Data;
-      unsigned Length;
-    } Tok;
-
-    struct {
-      unsigned RegNo;
-    } Reg;
-
-    struct {
-      const MCExpr *Val;
-      bool NeedAsmRewrite;
-    } Imm;
-
-    struct {
-      unsigned SegReg;
-      const MCExpr *Disp;
-      unsigned BaseReg;
-      unsigned IndexReg;
-      unsigned Scale;
-      unsigned Size;
-      bool NeedSizeDir;
-    } Mem;
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
   };
 
   X86Operand(KindTy K, SMLoc Start, SMLoc End)
@@ -1734,242 +1739,74 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
   return false;
 }
 
-bool X86AsmParser::
-processInstruction(MCInst &Inst,
-                   const SmallVectorImpl<MCParsedAsmOperand*> &Ops) {
-  switch (Inst.getOpcode()) {
-  default: return false;
-  case X86::AND16i16: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
-      return false;
-
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::AND16ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::AND32i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
-      return false;
-
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::AND32ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::AND64i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
-      return false;
-
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::AND64ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::XOR16i16: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
-      return false;
-
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::XOR16ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::XOR32i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
-      return false;
-
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::XOR32ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::XOR64i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
-      return false;
-
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::XOR64ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::OR16i16: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
-      return false;
-
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::OR16ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::OR32i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
-      return false;
-
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::OR32ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::OR64i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
-      return false;
-
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::OR64ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::CMP16i16: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
-      return false;
-
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::CMP16ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::CMP32i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
-      return false;
-
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::CMP32ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::CMP64i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
-      return false;
+static bool convertToSExti8(MCInst &Inst, unsigned Opcode, unsigned Reg,
+                            bool isCmp) {
+  MCInst TmpInst;
+  TmpInst.setOpcode(Opcode);
+  if (!isCmp)
+    TmpInst.addOperand(MCOperand::CreateReg(Reg));
+  TmpInst.addOperand(MCOperand::CreateReg(Reg));
+  TmpInst.addOperand(Inst.getOperand(0));
+  Inst = TmpInst;
+  return true;
+}
 
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::CMP64ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::ADD16i16: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
-      return false;
+static bool convert16i16to16ri8(MCInst &Inst, unsigned Opcode,
+                                bool isCmp = false) {
+  if (!Inst.getOperand(0).isImm() ||
+      !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
+    return false;
 
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::ADD16ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::ADD32i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
-      return false;
+  return convertToSExti8(Inst, Opcode, X86::AX, isCmp);
+}
 
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::ADD32ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::ADD64i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
-      return false;
+static bool convert32i32to32ri8(MCInst &Inst, unsigned Opcode,
+                                bool isCmp = false) {
+  if (!Inst.getOperand(0).isImm() ||
+      !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
+    return false;
 
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::ADD64ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::SUB16i16: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
-      return false;
+  return convertToSExti8(Inst, Opcode, X86::EAX, isCmp);
+}
 
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::SUB16ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::SUB32i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
-      return false;
+static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode,
+                                bool isCmp = false) {
+  if (!Inst.getOperand(0).isImm() ||
+      !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
+    return false;
 
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::SUB32ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
-  case X86::SUB64i32: {
-    if (!Inst.getOperand(0).isImm() ||
-        !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
-      return false;
+  return convertToSExti8(Inst, Opcode, X86::RAX, isCmp);
+}
 
-    MCInst TmpInst;
-    TmpInst.setOpcode(X86::SUB64ri8);
-    TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
-    TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
-    TmpInst.addOperand(Inst.getOperand(0));
-    Inst = TmpInst;
-    return true;
-  }
+bool X86AsmParser::
+processInstruction(MCInst &Inst,
+                   const SmallVectorImpl<MCParsedAsmOperand*> &Ops) {
+  switch (Inst.getOpcode()) {
+  default: return false;
+  case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8);
+  case X86::AND32i32: return convert32i32to32ri8(Inst, X86::AND32ri8);
+  case X86::AND64i32: return convert64i32to64ri8(Inst, X86::AND64ri8);
+  case X86::XOR16i16: return convert16i16to16ri8(Inst, X86::XOR16ri8);
+  case X86::XOR32i32: return convert32i32to32ri8(Inst, X86::XOR32ri8);
+  case X86::XOR64i32: return convert64i32to64ri8(Inst, X86::XOR64ri8);
+  case X86::OR16i16:  return convert16i16to16ri8(Inst, X86::OR16ri8);
+  case X86::OR32i32:  return convert32i32to32ri8(Inst, X86::OR32ri8);
+  case X86::OR64i32:  return convert64i32to64ri8(Inst, X86::OR64ri8);
+  case X86::CMP16i16: return convert16i16to16ri8(Inst, X86::CMP16ri8, true);
+  case X86::CMP32i32: return convert32i32to32ri8(Inst, X86::CMP32ri8, true);
+  case X86::CMP64i32: return convert64i32to64ri8(Inst, X86::CMP64ri8, true);
+  case X86::ADD16i16: return convert16i16to16ri8(Inst, X86::ADD16ri8);
+  case X86::ADD32i32: return convert32i32to32ri8(Inst, X86::ADD32ri8);
+  case X86::ADD64i32: return convert64i32to64ri8(Inst, X86::ADD64ri8);
+  case X86::SUB16i16: return convert16i16to16ri8(Inst, X86::SUB16ri8);
+  case X86::SUB32i32: return convert32i32to32ri8(Inst, X86::SUB32ri8);
+  case X86::SUB64i32: return convert64i32to64ri8(Inst, X86::SUB64ri8);
+  case X86::ADC16i16: return convert16i16to16ri8(Inst, X86::ADC16ri8);
+  case X86::ADC32i32: return convert32i32to32ri8(Inst, X86::ADC32ri8);
+  case X86::ADC64i32: return convert64i32to64ri8(Inst, X86::ADC64ri8);
+  case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8);
+  case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8);
+  case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8);
   }
 }
 
@@ -2080,7 +1917,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   // Check for the various suffix matches.
   Tmp[Base.size()] = Suffixes[0];
   unsigned ErrorInfoIgnore;
-  unsigned ErrorInfoMissingFeature;
+  unsigned ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
   unsigned Match1, Match2, Match3, Match4;
 
   Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index acc90ec..598ddee 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -315,18 +315,18 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
     return true;
   }
 
-  // Write an optimal sequence for the first 15 bytes.
-  const uint64_t OptimalCount = (Count < 16) ? Count : 15;
-  const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10;
-  for (uint64_t i = 0, e = Prefixes; i != e; i++)
-    OW->Write8(0x66);
-  const uint64_t Rest = OptimalCount - Prefixes;
-  for (uint64_t i = 0, e = Rest; i != e; i++)
-    OW->Write8(Nops[Rest - 1][i]);
-
-  // Finish with single byte nops.
-  for (uint64_t i = OptimalCount, e = Count; i != e; ++i)
-   OW->Write8(0x90);
+  // 15 is the longest single nop instruction.  Emit as many 15-byte nops as
+  // needed, then emit a nop of the remaining length.
+  do {
+    const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15);
+    const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
+    for (uint8_t i = 0; i < Prefixes; i++)
+      OW->Write8(0x66);
+    const uint8_t Rest = ThisNopLength - Prefixes;
+    for (uint8_t i = 0; i < Rest; i++)
+      OW->Write8(Nops[Rest - 1][i]);
+    Count -= ThisNopLength;
+  } while (Count != 0);
 
   return true;
 }
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 122204a..5fbefae 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -446,6 +446,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
                                            raw_ostream &OS) const {
   bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
   bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
 
   // VEX_R: opcode externsion equivalent to REX.R in
   // 1's complement (inverted) form
@@ -650,12 +651,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     //  dst(ModR/M), src1(ModR/M)
     //  dst(ModR/M), src1(ModR/M), imm8
     //
+    //  FMA4:
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
     if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
       VEX_R = 0x0;
     CurOp++;
 
     if (HasVEX_4V)
       VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+    if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+      CurOp++;
+
     if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
       VEX_B = 0x0;
     CurOp++;
@@ -666,9 +674,15 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     // MRMDestReg instructions forms:
     //  dst(ModR/M), src(ModR/M)
     //  dst(ModR/M), src(ModR/M), imm8
-    if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+    //  dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
       VEX_B = 0x0;
-    if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+    CurOp++;
+
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
       VEX_R = 0x0;
     break;
   case X86II::MRM0r: case X86II::MRM1r:
@@ -1038,9 +1052,14 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
 
   case X86II::MRMDestReg:
     EmitByte(BaseOpcode, CurByte, OS);
+    SrcRegNum = CurOp + 1;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
     EmitRegModRMByte(MI.getOperand(CurOp),
-                     GetX86RegNum(MI.getOperand(CurOp+1)), CurByte, OS);
-    CurOp += 2;
+                     GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
+    CurOp = SrcRegNum + 1;
     break;
 
   case X86II::MRMDestMem:
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index ece38aa..2518e02 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -816,6 +816,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
                                                const MCInstrDesc *Desc) const {
   bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
   bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
 
   // VEX_R: opcode externsion equivalent to REX.R in
   // 1's complement (inverted) form
@@ -1032,6 +1033,10 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
 
       if (HasVEX_4V)
         VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+      if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+        CurOp++;
+
       if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
         VEX_B = 0x0;
       CurOp++;
@@ -1042,9 +1047,15 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
       // MRMDestReg instructions forms:
       //  dst(ModR/M), src(ModR/M)
       //  dst(ModR/M), src(ModR/M), imm8
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      //  dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
         VEX_B = 0x0;
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+      CurOp++;
+
+      if (HasVEX_4V)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
         VEX_R = 0x0;
       break;
     case X86II::MRM0r: case X86II::MRM1r:
@@ -1279,9 +1290,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
 
   case X86II::MRMDestReg: {
     MCE.emitByte(BaseOpcode);
+
+    unsigned SrcRegNum = CurOp+1;
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      SrcRegNum++;
+
     emitRegModRMByte(MI.getOperand(CurOp).getReg(),
-                     getX86RegNum(MI.getOperand(CurOp+1).getReg()));
-    CurOp += 2;
+                     getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
+    CurOp = SrcRegNum + 1;
     break;
   }
   case X86II::MRMDestMem: {
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index b5c3270..85155f5 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1526,6 +1526,9 @@ bool X86FastISel::FastLowerArguments() {
   if (!FuncInfo.CanLowerReturn)
     return false;
 
+  if (Subtarget->isTargetWindows())
+    return false;
+
   const Function *F = FuncInfo.Fn;
   if (F->isVarArg())
     return false;
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index a05cf5c..54cbd40 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1386,7 +1386,6 @@ HasNestArgument(const MachineFunction *MF) {
   return false;
 }
 
-
 /// GetScratchRegister - Get a temp register for performing work in the
 /// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
 /// and the properties of the function either one or two registers will be
@@ -1612,22 +1611,21 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
 #endif
 }
 
-// Erlang programs may need a special prologue to handle the stack size they
-// might need at runtime. That is because Erlang/OTP does not implement a C
-// stack but uses a custom implementation of hybrid stack/heap
-// architecture. (for more information see Eric Stenman's Ph.D. thesis:
-// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
-//
-//
-// CheckStack:
-//	temp0 = sp - MaxStack
-//	if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
-// OldStart:
-//	...
-// IncStack:
-//	call inc_stack   # doubles the stack space
-//	temp0 = sp - MaxStack
-//	if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// Erlang programs may need a special prologue to handle the stack size they
+/// might need at runtime. That is because Erlang/OTP does not implement a C
+/// stack but uses a custom implementation of hybrid stack/heap architecture.
+/// (for more information see Eric Stenman's Ph.D. thesis:
+/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
+///
+/// CheckStack:
+///	  temp0 = sp - MaxStack
+///	  if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// OldStart:
+///	  ...
+/// IncStack:
+///	  call inc_stack   # doubles the stack space
+///	  temp0 = sp - MaxStack
+///	  if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
 void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
   const X86InstrInfo &TII = *TM.getInstrInfo();
   MachineFrameInfo *MFI = MF.getFrameInfo();
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1c3b9ae..e6858bc 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -85,6 +85,11 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                * ElemsPerChunk);
 
+  // If the input is a buildvector just emit a smaller one.
+  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+                       Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
+
   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                VecIdx);
@@ -181,9 +186,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setSchedulingPreference(Sched::RegPressure);
   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 
-  // Bypass i32 with i8 on Atom when compiling with O2
-  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
+  // Bypass expensive divides on Atom when compiling with O2
+  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
     addBypassSlowDiv(32, 8);
+    if (Subtarget->is64Bit())
+      addBypassSlowDiv(64, 16);
+  }
 
   if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
     // Setup Windows compiler runtime calls.
@@ -368,7 +376,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
-  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
   if (Subtarget->is64Bit())
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
@@ -4956,7 +4970,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
   return DAG.getNode(ISD::BITCAST, dl, VT,
                      DAG.getNode(Opc, dl, ShVT, SrcOp,
                              DAG.getConstant(NumBits,
-                                  TLI.getShiftAmountTy(SrcOp.getValueType()))));
+                                  TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
 }
 
 SDValue
@@ -7820,7 +7834,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
                               Chain.getValue(1));
   }
 
-  if (Subtarget->isTargetWindows()) {
+  if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
     // Just use the implicit TLS architecture
     // Need to generate someting similar to:
     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -7840,18 +7854,19 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     SDValue Chain = DAG.getEntryNode();
 
     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
-    // %gs:0x58 (64-bit).
+    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
+    // use its literal value of 0x2C.
     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
                                         ? Type::getInt8PtrTy(*DAG.getContext(),
                                                              256)
                                         : Type::getInt32PtrTy(*DAG.getContext(),
                                                               257));
 
-    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain,
-                                        Subtarget->is64Bit()
-                                        ? DAG.getIntPtrConstant(0x58)
-                                        : DAG.getExternalSymbol("_tls_array",
-                                                                getPointerTy()),
+    SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
+      (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
+        DAG.getExternalSymbol("_tls_array", getPointerTy()));
+
+    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
                                         MachinePointerInfo(Ptr),
                                         false, false, false, 0);
 
@@ -12248,7 +12263,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     return;
   }
   case ISD::UINT_TO_FP: {
-    if (N->getOperand(0).getValueType() != MVT::v2i32 &&
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+    if (N->getOperand(0).getValueType() != MVT::v2i32 ||
         N->getValueType(0) != MVT::v2f32)
       return;
     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
@@ -12890,13 +12906,16 @@ static unsigned getPseudoCMOVOpc(EVT VT) {
 // to
 //
 //    ...
-//    EAX = LOAD MI.addr
+//    t1 = LOAD MI.addr
 // loop:
-//    t1 = OP MI.val, EAX
-//    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+//    t4 = phi(t1, t3 / loop)
+//    t2 = OP MI.val, t4
+//    EAX = t4
+//    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
+//    t3 = EAX
 //    JNE loop
 // sink:
-//    dst = EAX
+//    dst = t3
 //    ...
 MachineBasicBlock *
 X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
@@ -12933,7 +12952,11 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
 
   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
   MVT::SimpleValueType VT = *RC->vt_begin();
-  unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT);
+  unsigned t1 = MRI.createVirtualRegister(RC);
+  unsigned t2 = MRI.createVirtualRegister(RC);
+  unsigned t3 = MRI.createVirtualRegister(RC);
+  unsigned t4 = MRI.createVirtualRegister(RC);
+  unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
 
   unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
   unsigned LOADOpc = getLoadOpcode(VT);
@@ -12941,12 +12964,16 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
   // For the atomic load-arith operator, we generate
   //
   //  thisMBB:
-  //    EAX = LOAD [MI.addr]
+  //    t1 = LOAD [MI.addr]
   //  mainMBB:
+  //    t4 = phi(t1 / thisMBB, t3 / mainMBB)
   //    t1 = OP MI.val, EAX
+  //    EAX = t4
   //    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+  //    t3 = EAX
   //    JNE mainMBB
   //  sinkMBB:
+  //    dst = t3
 
   MachineBasicBlock *thisMBB = MBB;
   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
@@ -12962,23 +12989,34 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
 
   // thisMBB:
-  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg);
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
-    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
-  MIB.setMemRefs(MMOBegin, MMOEnd);
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+    if (NewMO.isReg())
+      NewMO.setIsKill(false);
+    MIB.addOperand(NewMO);
+  }
+  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
+    unsigned flags = (*MMOI)->getFlags();
+    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
+    MachineMemOperand *MMO =
+      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
+                               (*MMOI)->getSize(),
+                               (*MMOI)->getBaseAlignment(),
+                               (*MMOI)->getTBAAInfo(),
+                               (*MMOI)->getRanges());
+    MIB.addMemOperand(MMO);
+  }
 
   thisMBB->addSuccessor(mainMBB);
 
   // mainMBB:
   MachineBasicBlock *origMainMBB = mainMBB;
-  mainMBB->addLiveIn(AccPhyReg);
 
-  // Copy AccPhyReg as it is used more than once.
-  unsigned AccReg = MRI.createVirtualRegister(RC);
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg)
-    .addReg(AccPhyReg);
+  // Add a PHI.
+  MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
+                        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
 
-  unsigned t1 = MRI.createVirtualRegister(RC);
   unsigned Opc = MI->getOpcode();
   switch (Opc) {
   default:
@@ -12996,20 +13034,20 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
   case X86::ATOMXOR32:
   case X86::ATOMXOR64: {
     unsigned ARITHOpc = getNonAtomicOpcode(Opc);
-    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg)
-      .addReg(AccReg);
+    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
+      .addReg(t4);
     break;
   }
   case X86::ATOMNAND8:
   case X86::ATOMNAND16:
   case X86::ATOMNAND32:
   case X86::ATOMNAND64: {
-    unsigned t2 = MRI.createVirtualRegister(RC);
+    unsigned Tmp = MRI.createVirtualRegister(RC);
     unsigned NOTOpc;
     unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
-    BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg)
-      .addReg(AccReg);
-    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2);
+    BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
+      .addReg(t4);
+    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
     break;
   }
   case X86::ATOMMAX8:
@@ -13033,20 +13071,22 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
 
     BuildMI(mainMBB, DL, TII->get(CMPOpc))
       .addReg(SrcReg)
-      .addReg(AccReg);
+      .addReg(t4);
 
     if (Subtarget->hasCMov()) {
       if (VT != MVT::i8) {
         // Native support
-        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1)
+        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
           .addReg(SrcReg)
-          .addReg(AccReg);
+          .addReg(t4);
       } else {
         // Promote i8 to i32 to use CMOV32
-        const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32);
+        const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+        const TargetRegisterClass *RC32 =
+          TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
         unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
         unsigned AccReg32 = MRI.createVirtualRegister(RC32);
-        unsigned t2 = MRI.createVirtualRegister(RC32);
+        unsigned Tmp = MRI.createVirtualRegister(RC32);
 
         unsigned Undef = MRI.createVirtualRegister(RC32);
         BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
@@ -13057,15 +13097,15 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
           .addImm(X86::sub_8bit);
         BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
           .addReg(Undef)
-          .addReg(AccReg)
+          .addReg(t4)
           .addImm(X86::sub_8bit);
 
-        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
+        BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
           .addReg(SrcReg32)
           .addReg(AccReg32);
 
-        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1)
-          .addReg(t2, 0, X86::sub_8bit);
+        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
+          .addReg(Tmp, 0, X86::sub_8bit);
       }
     } else {
       // Use pseudo select and lower them.
@@ -13074,36 +13114,47 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
       unsigned SelOpc = getPseudoCMOVOpc(VT);
       X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
       assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
-      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1)
-              .addReg(SrcReg).addReg(AccReg)
+      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
+              .addReg(SrcReg).addReg(t4)
               .addImm(CC);
       mainMBB = EmitLoweredSelect(MIB, mainMBB);
+      // Replace the original PHI node as mainMBB is changed after CMOV
+      // lowering.
+      BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
+        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
+      Phi->eraseFromParent();
     }
     break;
   }
   }
 
-  // Copy AccPhyReg back from virtual register.
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg)
-    .addReg(AccReg);
+  // Copy PhyReg back from virtual register.
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
+    .addReg(t4);
 
   MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
-    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
-  MIB.addReg(t1);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+    if (NewMO.isReg())
+      NewMO.setIsKill(false);
+    MIB.addOperand(NewMO);
+  }
+  MIB.addReg(t2);
   MIB.setMemRefs(MMOBegin, MMOEnd);
 
+  // Copy PhyReg back to virtual register.
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
+    .addReg(PhyReg);
+
   BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
 
   mainMBB->addSuccessor(origMainMBB);
   mainMBB->addSuccessor(sinkMBB);
 
   // sinkMBB:
-  sinkMBB->addLiveIn(AccPhyReg);
-
   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
           TII->get(TargetOpcode::COPY), DstReg)
-    .addReg(AccPhyReg);
+    .addReg(t3);
 
   MI->eraseFromParent();
   return sinkMBB;
@@ -13120,15 +13171,24 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
 // to
 //
 //    ...
-//    EAX = LOAD [MI.addr + 0]
-//    EDX = LOAD [MI.addr + 4]
+//    t1L = LOAD [MI.addr + 0]
+//    t1H = LOAD [MI.addr + 4]
 // loop:
-//    EBX = OP MI.val.lo, EAX
-//    ECX = OP MI.val.hi, EDX
+//    t4L = phi(t1L, t3L / loop)
+//    t4H = phi(t1H, t3H / loop)
+//    t2L = OP MI.val.lo, t4L
+//    t2H = OP MI.val.hi, t4H
+//    EAX = t4L
+//    EDX = t4H
+//    EBX = t2L
+//    ECX = t2H
 //    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+//    t3L = EAX
+//    t3H = EDX
 //    JNE loop
 // sink:
-//    dst = EDX:EAX
+//    dstL = t3L
+//    dstH = t3H
 //    ...
 MachineBasicBlock *
 X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
@@ -13169,20 +13229,37 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
   const TargetRegisterClass *RC = &X86::GR32RegClass;
   const TargetRegisterClass *RC8 = &X86::GR8RegClass;
 
+  unsigned t1L = MRI.createVirtualRegister(RC);
+  unsigned t1H = MRI.createVirtualRegister(RC);
+  unsigned t2L = MRI.createVirtualRegister(RC);
+  unsigned t2H = MRI.createVirtualRegister(RC);
+  unsigned t3L = MRI.createVirtualRegister(RC);
+  unsigned t3H = MRI.createVirtualRegister(RC);
+  unsigned t4L = MRI.createVirtualRegister(RC);
+  unsigned t4H = MRI.createVirtualRegister(RC);
+
   unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
   unsigned LOADOpc = X86::MOV32rm;
 
   // For the atomic load-arith operator, we generate
   //
   //  thisMBB:
-  //    EAX = LOAD [MI.addr + 0]
-  //    EDX = LOAD [MI.addr + 4]
+  //    t1L = LOAD [MI.addr + 0]
+  //    t1H = LOAD [MI.addr + 4]
   //  mainMBB:
-  //    EBX = OP MI.vallo, EAX
-  //    ECX = OP MI.valhi, EDX
+  //    t4L = phi(t1L / thisMBB, t3L / mainMBB)
+  //    t4H = phi(t1H / thisMBB, t3H / mainMBB)
+  //    t2L = OP MI.val.lo, t4L
+  //    t2H = OP MI.val.hi, t4H
+  //    EBX = t2L
+  //    ECX = t2H
   //    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
-  //    JNE mainMBB
+  //    t3L = EAX
+  //    t3H = EDX
+  //    JNE loop
   //  sinkMBB:
+  //    dstL = t3L
+  //    dstH = t3H
 
   MachineBasicBlock *thisMBB = MBB;
   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
@@ -13199,35 +13276,50 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
 
   // thisMBB:
   // Lo
-  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX);
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
-    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
-  MIB.setMemRefs(MMOBegin, MMOEnd);
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+    if (NewMO.isReg())
+      NewMO.setIsKill(false);
+    MIB.addOperand(NewMO);
+  }
+  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
+    unsigned flags = (*MMOI)->getFlags();
+    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
+    MachineMemOperand *MMO =
+      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
+                               (*MMOI)->getSize(),
+                               (*MMOI)->getBaseAlignment(),
+                               (*MMOI)->getTBAAInfo(),
+                               (*MMOI)->getRanges());
+    MIB.addMemOperand(MMO);
+  };
+  MachineInstr *LowMI = MIB;
+
   // Hi
-  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX);
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    if (i == X86::AddrDisp)
+    if (i == X86::AddrDisp) {
       MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
-    else
-      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+    } else {
+      MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+      if (NewMO.isReg())
+        NewMO.setIsKill(false);
+      MIB.addOperand(NewMO);
+    }
   }
-  MIB.setMemRefs(MMOBegin, MMOEnd);
+  MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
 
   thisMBB->addSuccessor(mainMBB);
 
   // mainMBB:
   MachineBasicBlock *origMainMBB = mainMBB;
-  mainMBB->addLiveIn(X86::EAX);
-  mainMBB->addLiveIn(X86::EDX);
-
-  // Copy EDX:EAX as they are used more than once.
-  unsigned LoReg = MRI.createVirtualRegister(RC);
-  unsigned HiReg = MRI.createVirtualRegister(RC);
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX);
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX);
 
-  unsigned t1L = MRI.createVirtualRegister(RC);
-  unsigned t1H = MRI.createVirtualRegister(RC);
+  // Add PHIs.
+  MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
+                        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
+  MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
+                        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
 
   unsigned Opc = MI->getOpcode();
   switch (Opc) {
@@ -13240,19 +13332,23 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
   case X86::ATOMSUB6432: {
     unsigned HiOpc;
     unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
-    BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg);
-    BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
+      .addReg(SrcLoReg);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
+      .addReg(SrcHiReg);
     break;
   }
   case X86::ATOMNAND6432: {
     unsigned HiOpc, NOTOpc;
     unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
-    unsigned t2L = MRI.createVirtualRegister(RC);
-    unsigned t2H = MRI.createVirtualRegister(RC);
-    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg);
-    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg);
-    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L);
-    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H);
+    unsigned TmpL = MRI.createVirtualRegister(RC);
+    unsigned TmpH = MRI.createVirtualRegister(RC);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
+      .addReg(t4L);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
+      .addReg(t4H);
+    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
+    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
     break;
   }
   case X86::ATOMMAX6432:
@@ -13268,12 +13364,12 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
     unsigned cc = MRI.createVirtualRegister(RC);
     // cl := cmp src_lo, lo
     BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
-      .addReg(SrcLoReg).addReg(LoReg);
+      .addReg(SrcLoReg).addReg(t4L);
     BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
     BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
     // ch := cmp src_hi, hi
     BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
-      .addReg(SrcHiReg).addReg(HiReg);
+      .addReg(SrcHiReg).addReg(t4H);
     BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
     BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
     // cc := if (src_hi == hi) ? cl : ch;
@@ -13288,58 +13384,74 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
     }
     BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
     if (Subtarget->hasCMov()) {
-      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L)
-        .addReg(SrcLoReg).addReg(LoReg);
-      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H)
-        .addReg(SrcHiReg).addReg(HiReg);
+      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
+        .addReg(SrcLoReg).addReg(t4L);
+      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
+        .addReg(SrcHiReg).addReg(t4H);
     } else {
-      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L)
-              .addReg(SrcLoReg).addReg(LoReg)
+      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
+              .addReg(SrcLoReg).addReg(t4L)
               .addImm(X86::COND_NE);
       mainMBB = EmitLoweredSelect(MIB, mainMBB);
-      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H)
-              .addReg(SrcHiReg).addReg(HiReg)
+      // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
+      // 2nd CMOV lowering.
+      mainMBB->addLiveIn(X86::EFLAGS);
+      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
+              .addReg(SrcHiReg).addReg(t4H)
               .addImm(X86::COND_NE);
       mainMBB = EmitLoweredSelect(MIB, mainMBB);
+      // Replace the original PHI node as mainMBB is changed after CMOV
+      // lowering.
+      BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
+        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
+      BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
+        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
+      PhiL->eraseFromParent();
+      PhiH->eraseFromParent();
     }
     break;
   }
   case X86::ATOMSWAP6432: {
     unsigned HiOpc;
     unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
-    BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg);
-    BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
     break;
   }
   }
 
   // Copy EDX:EAX back from HiReg:LoReg
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg);
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
   // Copy ECX:EBX from t1H:t1L
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L);
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
 
   MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
-    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+    if (NewMO.isReg())
+      NewMO.setIsKill(false);
+    MIB.addOperand(NewMO);
+  }
   MIB.setMemRefs(MMOBegin, MMOEnd);
 
+  // Copy EDX:EAX back to t3H:t3L
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
+
   BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
 
   mainMBB->addSuccessor(origMainMBB);
   mainMBB->addSuccessor(sinkMBB);
 
   // sinkMBB:
-  sinkMBB->addLiveIn(X86::EAX);
-  sinkMBB->addLiveIn(X86::EDX);
-
   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
           TII->get(TargetOpcode::COPY), DstLoReg)
-    .addReg(X86::EAX);
+    .addReg(t3L);
   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
           TII->get(TargetOpcode::COPY), DstHiReg)
-    .addReg(X86::EDX);
+    .addReg(t3H);
 
   MI->eraseFromParent();
   return sinkMBB;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 958ceb0..da1dad0 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -471,7 +471,7 @@ namespace llvm {
 
     virtual unsigned getJumpTableEncoding() const;
 
-    virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
+    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
 
     virtual const MCExpr *
     LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index d86a406..f406416 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -14,7 +14,7 @@
 
 //===----------------------------------------------------------------------===//
 // LEA - Load Effective Address
-
+let SchedRW = [WriteLEA] in {
 let neverHasSideEffects = 1 in
 def LEA16r   : I<0x8D, MRMSrcMem,
                  (outs GR16:$dst), (ins i32mem:$src),
@@ -36,41 +36,52 @@ let isReMaterializable = 1 in
 def LEA64r   : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
                   "lea{q}\t{$src|$dst}, {$dst|$src}",
                   [(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
-
-
+} // SchedRW
 
 //===----------------------------------------------------------------------===//
 //  Fixed-Register Multiplication and Division Instructions.
 //
 
+// SchedModel info for instruction that loads one value and gets the second
+// (and possibly third) value from a register.
+// This is used for instructions that put the memory operands before other
+// uses.
+class SchedLoadReg<SchedWrite SW> : Sched<[SW,
+  // Memory operand.
+  ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+  // Register reads (implicit or explicit).
+  ReadAfterLd, ReadAfterLd]>;
+
 // Extra precision multiplication
 
 // AL is really implied by AX, but the registers in Defs must match the
 // SDNode results (i8, i32).
+// AL,AH = AL*GR8
 let Defs = [AL,EFLAGS,AX], Uses = [AL] in
 def MUL8r  : I<0xF6, MRM4r, (outs),  (ins GR8:$src), "mul{b}\t$src",
                // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
                // This probably ought to be moved to a def : Pat<> if the
                // syntax can be accepted.
                [(set AL, (mul AL, GR8:$src)),
-                (implicit EFLAGS)], IIC_MUL8>;     // AL,AH = AL*GR8
-
+                (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
 let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
 def MUL16r : I<0xF7, MRM4r, (outs),  (ins GR16:$src),
                "mul{w}\t$src",
-               [], IIC_MUL16_REG>, OpSize;    // AX,DX = AX*GR16
-
+               [], IIC_MUL16_REG>, OpSize, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
 def MUL32r : I<0xF7, MRM4r, (outs),  (ins GR32:$src),
-               "mul{l}\t$src",   // EAX,EDX = EAX*GR32
+               "mul{l}\t$src",
                [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
-               IIC_MUL32_REG>;
+               IIC_MUL32_REG>, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
 let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in
 def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
-                "mul{q}\t$src",          // RAX,RDX = RAX*GR64
+                "mul{q}\t$src",
                 [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
-                IIC_MUL64>;
-
+                IIC_MUL64>, Sched<[WriteIMul]>;
+// AL,AH = AL*[mem8]
 let Defs = [AL,EFLAGS,AX], Uses = [AL] in
 def MUL8m  : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
                "mul{b}\t$src",
@@ -78,51 +89,60 @@ def MUL8m  : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
                // This probably ought to be moved to a def : Pat<> if the
                // syntax can be accepted.
                [(set AL, (mul AL, (loadi8 addr:$src))),
-                (implicit EFLAGS)], IIC_MUL8>;   // AL,AH = AL*[mem8]
-
+                (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
 let mayLoad = 1, neverHasSideEffects = 1 in {
 let Defs = [AX,DX,EFLAGS], Uses = [AX] in
 def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
                "mul{w}\t$src",
-               [], IIC_MUL16_MEM>, OpSize; // AX,DX = AX*[mem16]
-
+               [], IIC_MUL16_MEM>, OpSize, SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
 def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
               "mul{l}\t$src",
-              [], IIC_MUL32_MEM>;          // EAX,EDX = EAX*[mem32]
+              [], IIC_MUL32_MEM>, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
 let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
 def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
-                "mul{q}\t$src", [], IIC_MUL64>;   // RAX,RDX = RAX*[mem64]
+                "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>;
 }
 
 let neverHasSideEffects = 1 in {
+// AL,AH = AL*GR8
 let Defs = [AL,EFLAGS,AX], Uses = [AL] in
 def IMUL8r  : I<0xF6, MRM5r, (outs),  (ins GR8:$src), "imul{b}\t$src", [],
-              IIC_IMUL8>; // AL,AH = AL*GR8
+              IIC_IMUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
 let Defs = [AX,DX,EFLAGS], Uses = [AX] in
 def IMUL16r : I<0xF7, MRM5r, (outs),  (ins GR16:$src), "imul{w}\t$src", [],
-              IIC_IMUL16_RR>, OpSize;    // AX,DX = AX*GR16
+              IIC_IMUL16_RR>, OpSize, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
 def IMUL32r : I<0xF7, MRM5r, (outs),  (ins GR32:$src), "imul{l}\t$src", [],
-              IIC_IMUL32_RR>; // EAX,EDX = EAX*GR32
+              IIC_IMUL32_RR>, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
 let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
 def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [],
-              IIC_IMUL64_RR>; // RAX,RDX = RAX*GR64
+              IIC_IMUL64_RR>, Sched<[WriteIMul]>;
 
 let mayLoad = 1 in {
+// AL,AH = AL*[mem8]
 let Defs = [AL,EFLAGS,AX], Uses = [AL] in
 def IMUL8m  : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
-                "imul{b}\t$src", [], IIC_IMUL8>;    // AL,AH = AL*[mem8]
+                "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
 let Defs = [AX,DX,EFLAGS], Uses = [AX] in
 def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
-                "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize;
-                // AX,DX = AX*[mem16]
+                "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize,
+              SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
 def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
-                "imul{l}\t$src", [], IIC_IMUL32_MEM>;  // EAX,EDX = EAX*[mem32]
+                "imul{l}\t$src", [], IIC_IMUL32_MEM>, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
 let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
 def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
-                 "imul{q}\t$src", [], IIC_IMUL64>;  // RAX,RDX = RAX*[mem64]
+                 "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>;
 }
 } // neverHasSideEffects
 
@@ -130,7 +150,8 @@ def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
 let Defs = [EFLAGS] in {
 let Constraints = "$src1 = $dst" in {
 
-let isCommutable = 1 in {  // X = IMUL Y, Z --> X = IMUL Z, Y
+let isCommutable = 1, SchedRW = [WriteIMul] in {
+// X = IMUL Y, Z --> X = IMUL Z, Y
 // Register-Register Signed Integer Multiply
 def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
                  "imul{w}\t{$src2, $dst|$dst, $src2}",
@@ -148,9 +169,10 @@ def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
                   [(set GR64:$dst, EFLAGS,
                         (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>,
                  TB;
-}
+} // isCommutable, SchedRW
 
 // Register-Memory Signed Integer Multiply
+let SchedRW = [WriteIMulLd, ReadAfterLd] in {
 def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
                                   (ins GR16:$src1, i16mem:$src2),
                  "imul{w}\t{$src2, $dst|$dst, $src2}",
@@ -172,12 +194,14 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
                         (X86smul_flag GR64:$src1, (load addr:$src2)))],
                         IIC_IMUL64_RM>,
                TB;
+} // SchedRW
 } // Constraints = "$src1 = $dst"
 
 } // Defs = [EFLAGS]
 
 // Surprisingly enough, these are not two address instructions!
 let Defs = [EFLAGS] in {
+let SchedRW = [WriteIMul] in {
 // Register-Integer Signed Integer Multiply
 def IMUL16rri  : Ii16<0x69, MRMSrcReg,                      // GR16 = GR16*I16
                       (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
@@ -216,9 +240,10 @@ def IMUL64rri8 : RIi8<0x6B, MRMSrcReg,                      // GR64 = GR64*I8
                       [(set GR64:$dst, EFLAGS,
                             (X86smul_flag GR64:$src1, i64immSExt8:$src2))],
                             IIC_IMUL64_RRI>;
-
+} // SchedRW
 
 // Memory-Integer Signed Integer Multiply
+let SchedRW = [WriteIMulLd] in {
 def IMUL16rmi  : Ii16<0x69, MRMSrcMem,                     // GR16 = [mem16]*I16
                       (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
                       "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -260,6 +285,7 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem,                      // GR64 = [mem64]*I8
                             (X86smul_flag (load addr:$src1),
                                           i64immSExt8:$src2))],
                                           IIC_IMUL64_RMI>;
+} // SchedRW
 } // Defs = [EFLAGS]
 
 
@@ -267,6 +293,7 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem,                      // GR64 = [mem64]*I8
 
 // unsigned division/remainder
 let hasSideEffects = 1 in { // so that we don't speculatively execute
+let SchedRW = [WriteIDiv] in {
 let Defs = [AL,EFLAGS,AX], Uses = [AX] in
 def DIV8r  : I<0xF6, MRM6r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
                "div{b}\t$src", [], IIC_DIV8_REG>;
@@ -280,24 +307,30 @@ def DIV32r : I<0xF7, MRM6r, (outs),  (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
 let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
 def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
                 "div{q}\t$src", [], IIC_DIV64>;
+} // SchedRW
 
 let mayLoad = 1 in {
 let Defs = [AL,EFLAGS,AX], Uses = [AX] in
 def DIV8m  : I<0xF6, MRM6m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
-               "div{b}\t$src", [], IIC_DIV8_MEM>;
+               "div{b}\t$src", [], IIC_DIV8_MEM>,
+             SchedLoadReg<WriteIDivLd>;
 let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
 def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src),  // DX:AX/[mem16] = AX,DX
-               "div{w}\t$src", [], IIC_DIV16>, OpSize;
+               "div{w}\t$src", [], IIC_DIV16>, OpSize,
+             SchedLoadReg<WriteIDivLd>;
 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in    // EDX:EAX/[mem32] = EAX,EDX
 def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
-               "div{l}\t$src", [], IIC_DIV32>;
+               "div{l}\t$src", [], IIC_DIV32>,
+             SchedLoadReg<WriteIDivLd>;
 // RDX:RAX/[mem64] = RAX,RDX
 let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
 def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
-                "div{q}\t$src", [], IIC_DIV64>;
+                "div{q}\t$src", [], IIC_DIV64>,
+             SchedLoadReg<WriteIDivLd>;
 }
 
 // Signed division/remainder.
+let SchedRW = [WriteIDiv] in {
 let Defs = [AL,EFLAGS,AX], Uses = [AX] in
 def IDIV8r : I<0xF6, MRM7r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
                "idiv{b}\t$src", [], IIC_IDIV8>;
@@ -311,20 +344,25 @@ def IDIV32r: I<0xF7, MRM7r, (outs),  (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
 let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
 def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
                 "idiv{q}\t$src", [], IIC_IDIV64>;
+} // SchedRW
 
 let mayLoad = 1 in {
 let Defs = [AL,EFLAGS,AX], Uses = [AX] in
 def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
-               "idiv{b}\t$src", [], IIC_IDIV8>;
+               "idiv{b}\t$src", [], IIC_IDIV8>,
+             SchedLoadReg<WriteIDivLd>;
 let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
 def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src),  // DX:AX/[mem16] = AX,DX
-               "idiv{w}\t$src", [], IIC_IDIV16>, OpSize;
+               "idiv{w}\t$src", [], IIC_IDIV16>, OpSize,
+             SchedLoadReg<WriteIDivLd>;
 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in    // EDX:EAX/[mem32] = EAX,EDX
 def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
-               "idiv{l}\t$src", [], IIC_IDIV32>;
+               "idiv{l}\t$src", [], IIC_IDIV32>,
+             SchedLoadReg<WriteIDivLd>;
 let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
 def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
-                "idiv{q}\t$src", [], IIC_IDIV64>;
+                "idiv{q}\t$src", [], IIC_IDIV64>,
+             SchedLoadReg<WriteIDivLd>;
 }
 } // hasSideEffects = 0
 
@@ -335,7 +373,7 @@ def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
 // unary instructions
 let CodeSize = 2 in {
 let Defs = [EFLAGS] in {
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
 def NEG8r  : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
                "neg{b}\t$dst",
                [(set GR8:$dst, (ineg GR8:$src1)),
@@ -351,8 +389,10 @@ def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
 def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
                 [(set GR64:$dst, (ineg GR64:$src1)),
                  (implicit EFLAGS)], IIC_UNARY_REG>;
-} // Constraints = "$src1 = $dst"
+} // Constraints = "$src1 = $dst", SchedRW
 
+// Read-modify-write negate.
+let SchedRW = [WriteALULd, WriteRMW] in {
 def NEG8m  : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
                "neg{b}\t$dst",
                [(store (ineg (loadi8 addr:$dst)), addr:$dst),
@@ -368,12 +408,13 @@ def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
 def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
                 [(store (ineg (loadi64 addr:$dst)), addr:$dst),
                  (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // SchedRW
 } // Defs = [EFLAGS]
 
 
 // Note: NOT does not set EFLAGS!
 
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
 // Match xor -1 to not. Favors these over a move imm + xor to save code size.
 let AddedComplexity = 15 in {
 def NOT8r  : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
@@ -388,8 +429,9 @@ def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
 def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
                 [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
 }
-} // Constraints = "$src1 = $dst"
+} // Constraints = "$src1 = $dst", SchedRW
 
+let SchedRW = [WriteALULd, WriteRMW] in {
 def NOT8m  : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
                "not{b}\t$dst",
                [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
@@ -402,11 +444,12 @@ def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
                [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
 def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
                 [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+} // SchedRW
 } // CodeSize
 
 // TODO: inc/dec is slow for P4, but fast for Pentium-M.
 let Defs = [EFLAGS] in {
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
 let CodeSize = 2 in
 def INC8r  : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
                "inc{b}\t$dst",
@@ -454,9 +497,9 @@ def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
                 Requires<[In64BitMode]>;
 } // isConvertibleToThreeAddress = 1, CodeSize = 2
 
-} // Constraints = "$src1 = $dst"
+} // Constraints = "$src1 = $dst", SchedRW
 
-let CodeSize = 2 in {
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
   def INC8m  : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
                [(store (add (loadi8 addr:$dst), 1), addr:$dst),
                 (implicit EFLAGS)], IIC_UNARY_MEM>;
@@ -491,9 +534,9 @@ def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
                   [(store (add (loadi32 addr:$dst), -1), addr:$dst),
                     (implicit EFLAGS)], IIC_UNARY_MEM>,
                 Requires<[In64BitMode]>;
-} // CodeSize = 2
+} // CodeSize = 2, SchedRW
 
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
 let CodeSize = 2 in
 def DEC8r  : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
                "dec{b}\t$dst",
@@ -514,10 +557,10 @@ def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
                 [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
                 IIC_UNARY_REG>;
 } // CodeSize = 2
-} // Constraints = "$src1 = $dst"
+} // Constraints = "$src1 = $dst", SchedRW
 
 
-let CodeSize = 2 in {
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
   def DEC8m  : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
                [(store (add (loadi8 addr:$dst), -1), addr:$dst),
                 (implicit EFLAGS)], IIC_UNARY_MEM>;
@@ -532,7 +575,7 @@ let CodeSize = 2 in {
   def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
                   [(store (add (loadi64 addr:$dst), -1), addr:$dst),
                    (implicit EFLAGS)], IIC_UNARY_MEM>;
-} // CodeSize = 2
+} // CodeSize = 2, SchedRW
 } // Defs = [EFLAGS]
 
 
@@ -646,7 +689,8 @@ class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
               Format f = MRMDestReg>
   : ITy<opcode, f, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>;
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+    Sched<[WriteALU]>;
 
 // BinOpRR_R - Instructions like "add reg, reg, reg", where the pattern has
 // just a regclass (no eflags) as a result.
@@ -689,7 +733,8 @@ class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
   : ITy<opcode, MRMSrcReg, typeinfo,
         (outs typeinfo.RegClass:$dst),
         (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
-        mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM> {
+        mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
   // The disassembler should know about this, but not the asmparser.
   let isCodeGenOnly = 1;
   let hasSideEffects = 0;
@@ -699,7 +744,8 @@ class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
 class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
   : ITy<opcode, MRMSrcReg, typeinfo, (outs),
         (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM> {
+        mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
   // The disassembler should know about this, but not the asmparser.
   let isCodeGenOnly = 1;
   let hasSideEffects = 0;
@@ -710,7 +756,8 @@ class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
               dag outlist, list<dag> pattern>
   : ITy<opcode, MRMSrcMem, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>;
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+    Sched<[WriteALULd, ReadAfterLd]>;
 
 // BinOpRM_R - Instructions like "add reg, reg, [mem]".
 class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -746,7 +793,8 @@ class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
               Format f, dag outlist, list<dag> pattern>
   : ITy<opcode, f, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM> {
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
   let ImmT = typeinfo.ImmEncoding;
 }
 
@@ -783,7 +831,8 @@ class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
                Format f, dag outlist, list<dag> pattern>
   : ITy<opcode, f, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM> {
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
   let ImmT = Imm8; // Always 8-bit immediate.
 }
 
@@ -821,7 +870,8 @@ class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
               list<dag> pattern>
   : ITy<opcode, MRMDestMem, typeinfo,
         (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
-        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>;
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+    Sched<[WriteALULd, WriteRMW]>;
 
 // BinOpMR_RMW - Instructions like "add [mem], reg".
 class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -849,7 +899,8 @@ class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
               Format f, list<dag> pattern, bits<8> opcode = 0x80>
   : ITy<opcode, f, typeinfo,
         (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
-        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM> {
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+    Sched<[WriteALULd, WriteRMW]> {
   let ImmT = typeinfo.ImmEncoding;
 }
 
@@ -1210,11 +1261,12 @@ multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
   def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))],
-            IIC_BIN_NONMEM>;
+            IIC_BIN_NONMEM>, Sched<[WriteALU]>;
   def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, EFLAGS,
-             (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>;
+             (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>,
+           Sched<[WriteALULd, ReadAfterLd]>;
 }
 
 let Predicates = [HasBMI], Defs = [EFLAGS] in {
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 0979752..105963f 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -16,6 +16,8 @@
 class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
   InstrItinClass rr = arg_rr;
   InstrItinClass rm = arg_rm;
+  // InstrSchedModel info.
+  X86FoldableSchedWrite Sched = WriteFAdd;
 }
 
 class SizeItins<OpndItins arg_s, OpndItins arg_d> {
@@ -45,6 +47,7 @@ def SSE_ALU_ITINS_S : SizeItins<
   SSE_ALU_F32S, SSE_ALU_F64S
 >;
 
+let Sched = WriteFMul in {
 def SSE_MUL_F32S : OpndItins<
   IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM
 >;
@@ -52,11 +55,13 @@ def SSE_MUL_F32S : OpndItins<
 def SSE_MUL_F64S : OpndItins<
   IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
 >;
+}
 
 def SSE_MUL_ITINS_S : SizeItins<
   SSE_MUL_F32S, SSE_MUL_F64S
 >;
 
+let Sched = WriteFDiv in {
 def SSE_DIV_F32S : OpndItins<
   IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM
 >;
@@ -64,6 +69,7 @@ def SSE_DIV_F32S : OpndItins<
 def SSE_DIV_F64S : OpndItins<
   IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
 >;
+}
 
 def SSE_DIV_ITINS_S : SizeItins<
   SSE_DIV_F32S, SSE_DIV_F64S
@@ -82,6 +88,7 @@ def SSE_ALU_ITINS_P : SizeItins<
   SSE_ALU_F32P, SSE_ALU_F64P
 >;
 
+let Sched = WriteFMul in {
 def SSE_MUL_F32P : OpndItins<
   IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM
 >;
@@ -89,11 +96,13 @@ def SSE_MUL_F32P : OpndItins<
 def SSE_MUL_F64P : OpndItins<
   IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
 >;
+}
 
 def SSE_MUL_ITINS_P : SizeItins<
   SSE_MUL_F32P, SSE_MUL_F64P
 >;
 
+let Sched = WriteFDiv in {
 def SSE_DIV_F32P : OpndItins<
   IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM
 >;
@@ -101,6 +110,7 @@ def SSE_DIV_F32P : OpndItins<
 def SSE_DIV_F64P : OpndItins<
   IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
 >;
+}
 
 def SSE_DIV_ITINS_P : SizeItins<
   SSE_DIV_F32P, SSE_DIV_F64P
@@ -110,6 +120,7 @@ def SSE_BIT_ITINS_P : OpndItins<
   IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
 >;
 
+let Sched = WriteVecALU in {
 def SSE_INTALU_ITINS_P : OpndItins<
   IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
 >;
@@ -117,7 +128,9 @@ def SSE_INTALU_ITINS_P : OpndItins<
 def SSE_INTALUQ_ITINS_P : OpndItins<
   IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
 >;
+}
 
+let Sched = WriteVecIMul in
 def SSE_INTMUL_ITINS_P : OpndItins<
   IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
 >;
@@ -148,13 +161,15 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>;
+       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>,
+       Sched<[itins.Sched]>;
   }
   def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>;
+       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
 /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
@@ -189,14 +204,16 @@ multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>;
+       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+       Sched<[itins.Sched]>;
   let mayLoad = 1 in
     def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
-          itins.rm, d>;
+          itins.rm, d>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
 /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
@@ -209,12 +226,14 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       pat_rr, IIC_DEFAULT, d>;
+       pat_rr, IIC_DEFAULT, d>,
+       Sched<[WriteVecLogic]>;
   def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       pat_rm, IIC_DEFAULT, d>;
+       pat_rm, IIC_DEFAULT, d>,
+       Sched<[WriteVecLogicLd, ReadAfterLd]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -444,7 +463,7 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
               !strconcat(base_opc, asm_opr),
               [(set VR128:$dst, (vt (OpNode VR128:$src1,
                                  (scalar_to_vector RC:$src2))))],
-              IIC_SSE_MOV_S_RR>;
+              IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>;
 
   // For the disassembler
   let isCodeGenOnly = 1, hasSideEffects = 0 in
@@ -464,7 +483,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
   def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
-                     VEX, VEX_LIG;
+                     VEX, VEX_LIG, Sched<[WriteStore]>;
   // SSE1 & 2
   let Constraints = "$src1 = $dst" in {
     defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
@@ -473,7 +492,8 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
 
   def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;
+                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+                  Sched<[WriteStore]>;
 }
 
 // Loading from memory automatically zeroing upper bits.
@@ -482,11 +502,11 @@ multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
   def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (mem_pat addr:$src))],
-                     IIC_SSE_MOV_S_RM>, VEX, VEX_LIG;
+                     IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>;
   def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (mem_pat addr:$src))],
-                     IIC_SSE_MOV_S_RM>;
+                     IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>;
 }
 
 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS;
@@ -745,11 +765,13 @@ multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                             bit IsReMaterializable = 1> {
 let neverHasSideEffects = 1 in
   def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>;
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
+           Sched<[WriteMove]>;
 let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
   def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
               !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
-                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>;
+                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
+           Sched<[WriteLoad]>;
 }
 
 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
@@ -790,6 +812,7 @@ defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                               "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                               TB, OpSize;
 
+let SchedRW = [WriteStore] in {
 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movaps\t{$src, $dst|$dst, $src}",
                    [(alignedstore (v4f32 VR128:$src), addr:$dst)],
@@ -822,6 +845,7 @@ def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movupd\t{$src, $dst|$dst, $src}",
                    [(store (v4f64 VR256:$src), addr:$dst)],
                    IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+} // SchedRW
 
 // For disassembler
 let isCodeGenOnly = 1, hasSideEffects = 0 in {
@@ -880,6 +904,7 @@ def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
 def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
           (VMOVUPDYmr addr:$dst, VR256:$src)>;
 
+let SchedRW = [WriteStore] in {
 def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movaps\t{$src, $dst|$dst, $src}",
                    [(alignedstore (v4f32 VR128:$src), addr:$dst)],
@@ -896,6 +921,7 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movupd\t{$src, $dst|$dst, $src}",
                    [(store (v2f64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVU_P_MR>;
+} // SchedRW
 
 // For disassembler
 let isCodeGenOnly = 1, hasSideEffects = 0 in {
@@ -1009,7 +1035,7 @@ let Predicates = [HasAVX] in {
             (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
   def : Pat<(store (v8i16 (extract_subvector
                            (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
-            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
   def : Pat<(store (v16i8 (extract_subvector
                            (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
             (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
@@ -1095,14 +1121,16 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
      [(set VR128:$dst,
        (psnode VR128:$src1,
               (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
-              itin, SSEPackedSingle>, TB;
+              itin, SSEPackedSingle>, TB,
+     Sched<[WriteShuffleLd, ReadAfterLd]>;
 
   def PDrm : PI<opc, MRMSrcMem,
          (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
          !strconcat(base_opc, "d", asm_opr),
      [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                               (scalar_to_vector (loadf64 addr:$src2)))))],
-              itin, SSEPackedDouble>, TB, OpSize;
+              itin, SSEPackedDouble>, TB, OpSize,
+     Sched<[WriteShuffleLd, ReadAfterLd]>;
 
 }
 
@@ -1123,6 +1151,7 @@ let AddedComplexity = 20 in {
                                     IIC_SSE_MOV_LH>;
 }
 
+let SchedRW = [WriteStore] in {
 def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movlps\t{$src, $dst|$dst, $src}",
                    [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
@@ -1143,6 +1172,7 @@ def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    [(store (f64 (vector_extract (v2f64 VR128:$src),
                                  (iPTR 0))), addr:$dst)],
                                  IIC_SSE_MOV_LH>;
+} // SchedRW
 
 let Predicates = [HasAVX] in {
   // Shuffle with VMOVLPS
@@ -1222,6 +1252,7 @@ let AddedComplexity = 20 in {
                                     IIC_SSE_MOV_LH>;
 }
 
+let SchedRW = [WriteStore] in {
 // v2f64 extract element 1 is always custom lowered to unpack high to low
 // and extract element 0 so the non-store version isn't too horrible.
 def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
@@ -1246,6 +1277,7 @@ def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    [(store (f64 (vector_extract
                                  (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                  (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+} // SchedRW
 
 let Predicates = [HasAVX] in {
   // VMOVHPS patterns
@@ -1296,14 +1328,14 @@ let AddedComplexity = 20 in {
                       [(set VR128:$dst,
                         (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                         IIC_SSE_MOV_LH>,
-                      VEX_4V;
+                      VEX_4V, Sched<[WriteShuffle]>;
   def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                        (ins VR128:$src1, VR128:$src2),
                       "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                         IIC_SSE_MOV_LH>,
-                      VEX_4V;
+                      VEX_4V, Sched<[WriteShuffle]>;
 }
 let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
   def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
@@ -1311,13 +1343,13 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
                       "movlhps\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
-                        IIC_SSE_MOV_LH>;
+                        IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
   def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                        (ins VR128:$src1, VR128:$src2),
                       "movhlps\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
-                        IIC_SSE_MOV_LH>;
+                        IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
 }
 
 let Predicates = [HasAVX] in {
@@ -1352,22 +1384,27 @@ def SSE_CVT_PD : OpndItins<
   IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
 >;
 
+let Sched = WriteCvtI2F in
 def SSE_CVT_PS : OpndItins<
   IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
 >;
 
+let Sched = WriteCvtI2F in
 def SSE_CVT_Scalar : OpndItins<
   IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
 >;
 
+let Sched = WriteCvtF2I in
 def SSE_CVT_SS2SI_32 : OpndItins<
   IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
 >;
 
+let Sched = WriteCvtF2I in
 def SSE_CVT_SS2SI_64 : OpndItins<
   IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
 >;
 
+let Sched = WriteCvtF2I in
 def SSE_CVT_SD2SI : OpndItins<
   IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
 >;
@@ -1377,10 +1414,10 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                      string asm, OpndItins itins> {
   def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                         [(set DstRC:$dst, (OpNode SrcRC:$src))],
-                        itins.rr>;
+                        itins.rr>, Sched<[itins.Sched]>;
   def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                         [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
-                        itins.rm>;
+                        itins.rm>, Sched<[itins.Sched.Folded]>;
 }
 
 multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
@@ -1388,10 +1425,10 @@ multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                        OpndItins itins> {
 let neverHasSideEffects = 1 in {
   def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
-             [], itins.rr, d>;
+             [], itins.rr, d>, Sched<[itins.Sched]>;
   let mayLoad = 1 in
   def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
-             [], itins.rm, d>;
+             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
 }
 }
 
@@ -1534,10 +1571,12 @@ multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          string asm, OpndItins itins> {
   def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
               !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
-              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>;
+              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
+           Sched<[itins.Sched]>;
   def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
               !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
-              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>;
+              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
+           Sched<[itins.Sched.Folded]>;
 }
 
 multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
@@ -1549,14 +1588,14 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
-              itins.rr>;
+              itins.rr>, Sched<[itins.Sched]>;
   def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
               (ins DstRC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
-              itins.rm>;
+              itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
 defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
@@ -2193,12 +2232,13 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
   def rr : SIi8<0xC2, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
                 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
-                itins.rr>;
+                itins.rr>, Sched<[itins.Sched]>;
   def rm : SIi8<0xC2, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                 [(set RC:$dst, (OpNode (VT RC:$src1),
                                          (ld_frag addr:$src2), imm:$cc))],
-                                         itins.rm>;
+                                         itins.rm>,
+           Sched<[itins.Sched.Folded, ReadAfterLd]>;
 
   // Accept explicit immediate argument form instead of comparison code.
   let neverHasSideEffects = 1 in {
@@ -2241,12 +2281,14 @@ multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
                       (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                         [(set VR128:$dst, (Int VR128:$src1,
                                                VR128:$src, imm:$cc))],
-                                               itins.rr>;
+                                               itins.rr>,
+           Sched<[itins.Sched]>;
   def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                       (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
                         [(set VR128:$dst, (Int VR128:$src1,
                                                (load addr:$src), imm:$cc))],
-                                               itins.rm>;
+                                               itins.rm>,
+           Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
 // Aliases to match intrinsics which expect XMM operand(s).
@@ -2276,12 +2318,14 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
   def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                      [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
-                     IIC_SSE_COMIS_RR, d>;
+                     IIC_SSE_COMIS_RR, d>,
+          Sched<[WriteFAdd]>;
   def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                      [(set EFLAGS, (OpNode (vt RC:$src1),
                                            (ld_frag addr:$src2)))],
-                                           IIC_SSE_COMIS_RM, d>;
+                                           IIC_SSE_COMIS_RM, d>,
+          Sched<[WriteFAddLd, ReadAfterLd]>;
 }
 
 let Defs = [EFLAGS] in {
@@ -2338,11 +2382,13 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
   def rri : PIi8<0xC2, MRMSrcReg,
              (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
              [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
-             IIC_SSE_CMPP_RR, d>;
+             IIC_SSE_CMPP_RR, d>,
+            Sched<[WriteFAdd]>;
   def rmi : PIi8<0xC2, MRMSrcMem,
              (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
              [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
-             IIC_SSE_CMPP_RM, d>;
+             IIC_SSE_CMPP_RM, d>,
+            Sched<[WriteFAddLd, ReadAfterLd]>;
 
   // Accept explicit immediate argument form instead of comparison code.
   let neverHasSideEffects = 1 in {
@@ -2427,12 +2473,14 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
   def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
                    [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
-                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>;
+                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+            Sched<[WriteShuffleLd, ReadAfterLd]>;
   let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
     def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
                    [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
-                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>;
+                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+              Sched<[WriteShuffle]>;
 }
 
 defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2516,13 +2564,14 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 asm, [(set RC:$dst,
                            (vt (OpNode RC:$src1, RC:$src2)))],
-                           IIC_SSE_UNPCK, d>;
+                           IIC_SSE_UNPCK, d>, Sched<[WriteShuffle]>;
     def rm : PI<opc, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                 asm, [(set RC:$dst,
                            (vt (OpNode RC:$src1,
                                        (mem_frag addr:$src2))))],
-                                       IIC_SSE_UNPCK, d>;
+                                       IIC_SSE_UNPCK, d>,
+             Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
 defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
@@ -2613,10 +2662,11 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
                                 Domain d> {
   def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src),
                 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
-                     [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>;
+                     [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
+             Sched<[WriteVecLogic]>;
   def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src),
                 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [],
-                IIC_SSE_MOVMSK, d>, REX_W;
+                IIC_SSE_MOVMSK, d>, REX_W, Sched<[WriteVecLogic]>;
 }
 
 let Predicates = [HasAVX] in {
@@ -2693,7 +2743,8 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>;
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+       Sched<[itins.Sched]>;
   def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
@@ -2701,7 +2752,8 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                      (bitconvert (memop_frag addr:$src2)))))],
-                                     itins.rm>;
+                                     itins.rm>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -2967,6 +3019,7 @@ let isCodeGenOnly = 1 in {
 ///
 /// And, we have a special variant form for a full-vector intrinsic form.
 
+let Sched = WriteFSqrt in {
 def SSE_SQRTP : OpndItins<
   IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM
 >;
@@ -2974,7 +3027,9 @@ def SSE_SQRTP : OpndItins<
 def SSE_SQRTS : OpndItins<
   IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM
 >;
+}
 
+let Sched = WriteFRcp in {
 def SSE_RCPP : OpndItins<
   IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
 >;
@@ -2982,6 +3037,7 @@ def SSE_RCPP : OpndItins<
 def SSE_RCPS : OpndItins<
   IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
 >;
+}
 
 /// sse1_fp_unop_s - SSE1 unops in scalar form.
 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
@@ -2991,24 +3047,26 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
                       (ins FR32:$src1, FR32:$src2),
                       !strconcat("v", OpcodeStr,
                                  "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG;
+                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
   let mayLoad = 1 in {
   def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
                       (ins FR32:$src1,f32mem:$src2),
                       !strconcat("v", OpcodeStr,
                                  "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG;
+                      []>, VEX_4V, VEX_LIG,
+                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
   def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
                       (ins VR128:$src1, ssmem:$src2),
                       !strconcat("v", OpcodeStr,
                                  "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG;
+                      []>, VEX_4V, VEX_LIG,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
 }
 
   def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
-                [(set FR32:$dst, (OpNode FR32:$src))]>;
+                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
   // For scalar unary operations, fold a load into the operation
   // only in OptForSize mode. It eliminates an instruction, but it also
   // eliminates a whole-register clobber (the load), so it introduces a
@@ -3016,13 +3074,15 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
   def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                 [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
-            Requires<[UseSSE1, OptForSize]>;
+            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
   def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>;
+                    [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>,
+                Sched<[itins.Sched]>;
   def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
                     !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>;
+                    [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>,
+                Sched<[itins.Sched.Folded]>;
 }
 
 /// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand.
@@ -3033,24 +3093,26 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
                        (ins FR32:$src1, FR32:$src2),
                        !strconcat("v", OpcodeStr,
                            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                []>, VEX_4V, VEX_LIG;
+                []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
   let mayLoad = 1 in {
   def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
                       (ins FR32:$src1,f32mem:$src2),
                       !strconcat("v", OpcodeStr,
                                  "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG;
+                      []>, VEX_4V, VEX_LIG,
+                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
   def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
                       (ins VR128:$src1, ssmem:$src2),
                       !strconcat("v", OpcodeStr,
                                  "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG;
+                      []>, VEX_4V, VEX_LIG,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
 }
 
   def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
-                [(set FR32:$dst, (OpNode FR32:$src))]>;
+                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
   // For scalar unary operations, fold a load into the operation
   // only in OptForSize mode. It eliminates an instruction, but it also
   // eliminates a whole-register clobber (the load), so it introduces a
@@ -3058,17 +3120,17 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
   def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                 [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
-            Requires<[UseSSE1, OptForSize]>;
+            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
   let Constraints = "$src1 = $dst" in {
     def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
                       (ins VR128:$src1, VR128:$src2),
                       !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
-                      [], itins.rr>;
+                      [], itins.rr>, Sched<[itins.Sched]>;
     let mayLoad = 1, hasSideEffects = 0 in
     def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
                       (ins VR128:$src1, ssmem:$src2),
                       !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
-                      [], itins.rm>;
+                      [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
 }
 
@@ -3080,30 +3142,32 @@ let Predicates = [HasAVX] in {
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
-                       itins.rr>, VEX;
+                       itins.rr>, VEX, Sched<[itins.Sched]>;
   def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))],
-                       itins.rm>, VEX;
+                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
   def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         !strconcat("v", OpcodeStr,
                                    "ps\t{$src, $dst|$dst, $src}"),
                         [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
-                        itins.rr>, VEX, VEX_L;
+                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
   def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         !strconcat("v", OpcodeStr,
                                    "ps\t{$src, $dst|$dst, $src}"),
                         [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
-                        itins.rm>, VEX, VEX_L;
+                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
 }
 
   def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
-                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>;
+                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
+            Sched<[itins.Sched]>;
   def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
-                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>;
+                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
+            Sched<[itins.Sched.Folded]>;
 }
 
 /// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
@@ -3115,33 +3179,33 @@ let Predicates = [HasAVX] in {
                            !strconcat("v", OpcodeStr,
                                       "ps\t{$src, $dst|$dst, $src}"),
                            [(set VR128:$dst, (V4F32Int VR128:$src))],
-                           itins.rr>, VEX;
+                           itins.rr>, VEX, Sched<[itins.Sched]>;
   def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                           !strconcat("v", OpcodeStr,
                           "ps\t{$src, $dst|$dst, $src}"),
                           [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
-                          itins.rm>, VEX;
+                          itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
   def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                             !strconcat("v", OpcodeStr,
                                        "ps\t{$src, $dst|$dst, $src}"),
                             [(set VR256:$dst, (V8F32Int VR256:$src))],
-                            itins.rr>, VEX, VEX_L;
+                            itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
   def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst),
                           (ins f256mem:$src),
                           !strconcat("v", OpcodeStr,
                                     "ps\t{$src, $dst|$dst, $src}"),
                           [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))],
-                          itins.rm>, VEX, VEX_L;
+                          itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
 }
 
   def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (V4F32Int VR128:$src))],
-                    itins.rr>;
+                    itins.rr>, Sched<[itins.Sched]>;
   def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
-                    itins.rm>;
+                    itins.rm>, Sched<[itins.Sched.Folded]>;
 }
 
 /// sse2_fp_unop_s - SSE2 unops in scalar form.
@@ -3152,35 +3216,40 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
                       (ins FR64:$src1, FR64:$src2),
                       !strconcat("v", OpcodeStr,
                                  "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG;
+                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
   let mayLoad = 1 in {
   def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
                       (ins FR64:$src1,f64mem:$src2),
                       !strconcat("v", OpcodeStr,
                                  "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG;
+                      []>, VEX_4V, VEX_LIG,
+                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
   def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
                       (ins VR128:$src1, sdmem:$src2),
                       !strconcat("v", OpcodeStr,
                                  "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG;
+                      []>, VEX_4V, VEX_LIG,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
 }
 
   def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                 !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
-                [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>;
+                [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>,
+            Sched<[itins.Sched]>;
   // See the comments in sse1_fp_unop_s for why this is OptForSize.
   def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
                 !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                 [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
-            Requires<[UseSSE2, OptForSize]>;
+            Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>;
   def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>;
+                    [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>,
+                Sched<[itins.Sched]>;
   def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
                     !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>;
+                    [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>,
+                Sched<[itins.Sched.Folded]>;
 }
 
 /// sse2_fp_unop_p - SSE2 unops in vector forms.
@@ -3191,30 +3260,32 @@ let Predicates = [HasAVX] in {
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
-                       itins.rr>, VEX;
+                       itins.rr>, VEX, Sched<[itins.Sched]>;
   def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))],
-                       itins.rm>, VEX;
+                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
   def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         !strconcat("v", OpcodeStr,
                                    "pd\t{$src, $dst|$dst, $src}"),
                         [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
-                        itins.rr>, VEX, VEX_L;
+                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
   def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         !strconcat("v", OpcodeStr,
                                    "pd\t{$src, $dst|$dst, $src}"),
                         [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
-                        itins.rm>, VEX, VEX_L;
+                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
 }
 
   def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
               !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
-              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>;
+              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
+            Sched<[itins.Sched]>;
   def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
-                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>;
+                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
+            Sched<[itins.Sched.Folded]>;
 }
 
 // Square root.
@@ -3305,52 +3376,48 @@ let Predicates = [UseSSE1] in {
 //===----------------------------------------------------------------------===//
 
 let AddedComplexity = 400 in { // Prefer non-temporal versions
-  def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
-                       (ins f128mem:$dst, VR128:$src),
-                       "movntps\t{$src, $dst|$dst, $src}",
-                       [(alignednontemporalstore (v4f32 VR128:$src),
-                                                 addr:$dst)],
-                                                 IIC_SSE_MOVNT>, VEX;
-  def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
-                       (ins f128mem:$dst, VR128:$src),
-                       "movntpd\t{$src, $dst|$dst, $src}",
-                       [(alignednontemporalstore (v2f64 VR128:$src),
-                                                 addr:$dst)],
-                                                 IIC_SSE_MOVNT>, VEX;
-
-  let ExeDomain = SSEPackedInt in
-  def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
-                           (ins f128mem:$dst, VR128:$src),
-                           "movntdq\t{$src, $dst|$dst, $src}",
-                           [(alignednontemporalstore (v2i64 VR128:$src),
-                                                     addr:$dst)],
-                                                     IIC_SSE_MOVNT>, VEX;
-
-  def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
-            (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>;
-
-  def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
-                       (ins f256mem:$dst, VR256:$src),
-                       "movntps\t{$src, $dst|$dst, $src}",
-                       [(alignednontemporalstore (v8f32 VR256:$src),
-                                                 addr:$dst)],
-                                                 IIC_SSE_MOVNT>, VEX, VEX_L;
-  def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
-                       (ins f256mem:$dst, VR256:$src),
-                       "movntpd\t{$src, $dst|$dst, $src}",
-                       [(alignednontemporalstore (v4f64 VR256:$src),
-                                                 addr:$dst)],
-                                                 IIC_SSE_MOVNT>, VEX, VEX_L;
-  let ExeDomain = SSEPackedInt in
-  def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
-                      (ins f256mem:$dst, VR256:$src),
-                      "movntdq\t{$src, $dst|$dst, $src}",
-                      [(alignednontemporalstore (v4i64 VR256:$src),
-                                                addr:$dst)],
-                                                IIC_SSE_MOVNT>, VEX, VEX_L;
-}
+let SchedRW = [WriteStore] in {
+def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
+                     (ins f128mem:$dst, VR128:$src),
+                     "movntps\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v4f32 VR128:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX;
+def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
+                     (ins f128mem:$dst, VR128:$src),
+                     "movntpd\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v2f64 VR128:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX;
+
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
+                         (ins f128mem:$dst, VR128:$src),
+                         "movntdq\t{$src, $dst|$dst, $src}",
+                         [(alignednontemporalstore (v2i64 VR128:$src),
+                                                   addr:$dst)],
+                                                   IIC_SSE_MOVNT>, VEX;
+
+def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
+                     (ins f256mem:$dst, VR256:$src),
+                     "movntps\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v8f32 VR256:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX, VEX_L;
+def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
+                     (ins f256mem:$dst, VR256:$src),
+                     "movntpd\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v4f64 VR256:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX, VEX_L;
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
+                    (ins f256mem:$dst, VR256:$src),
+                    "movntdq\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore (v4i64 VR256:$src),
+                                              addr:$dst)],
+                                              IIC_SSE_MOVNT>, VEX, VEX_L;
 
-let AddedComplexity = 400 in { // Prefer non-temporal versions
 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
@@ -3366,9 +3433,6 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
                     IIC_SSE_MOVNT>;
 
-def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
-          (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
-
 // There is no AVX form for instructions below this point
 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                  "movnti{l}\t{$src, $dst|$dst, $src}",
@@ -3380,7 +3444,14 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                      [(nontemporalstore (i64 GR64:$src), addr:$dst)],
                      IIC_SSE_MOVNT>,
                   TB, Requires<[HasSSE2]>;
-}
+} // SchedRW = [WriteStore]
+
+def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+          (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>;
+
+def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+          (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+} // AddedComplexity
 
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Prefetch and memory fence
@@ -3450,7 +3521,7 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
 
 let ExeDomain = SSEPackedInt in { // SSE integer instructions
 
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
 def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                     VEX;
@@ -3466,7 +3537,7 @@ def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
 }
 
 // For Disassembler
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
 def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movdqa\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>,
@@ -3484,7 +3555,7 @@ def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
 }
 
 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
-    neverHasSideEffects = 1 in {
+    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
 def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                    VEX;
@@ -3501,7 +3572,7 @@ let Predicates = [HasAVX] in {
 }
 }
 
-let mayStore = 1, neverHasSideEffects = 1 in {
+let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
 def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i128mem:$dst, VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
@@ -3520,6 +3591,7 @@ def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
 }
 }
 
+let SchedRW = [WriteMove] in {
 let neverHasSideEffects = 1 in
 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
@@ -3538,9 +3610,10 @@ def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqu\t{$src, $dst|$dst, $src}",
                        [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
 }
+} // SchedRW
 
 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
-    neverHasSideEffects = 1 in {
+    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "movdqa\t{$src, $dst|$dst, $src}",
                    [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
@@ -3552,7 +3625,7 @@ def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                  XS, Requires<[UseSSE2]>;
 }
 
-let mayStore = 1 in {
+let mayStore = 1, SchedRW = [WriteStore] in {
 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}",
                    [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
@@ -3580,6 +3653,7 @@ def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
 // SSE2 - Packed Integer Arithmetic Instructions
 //===---------------------------------------------------------------------===//
 
+let Sched = WriteVecIMul in
 def SSE_PMADD : OpndItins<
   IIC_SSE_PMADD, IIC_SSE_PMADD
 >;
@@ -3598,14 +3672,15 @@ multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>;
+       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
+      Sched<[itins.Sched]>;
   def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
-       itins.rm>;
+       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
 multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
@@ -3639,20 +3714,22 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
-        itins.rr>;
+        itins.rr>, Sched<[WriteVecShift]>;
   def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, i128mem:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode RC:$src1,
-                       (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>;
+                       (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
+      Sched<[WriteVecShiftLd, ReadAfterLd]>;
   def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
        (ins RC:$src1, i32i8imm:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>;
+       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>,
+       Sched<[WriteVecShift]>;
 }
 
 /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
@@ -3667,14 +3744,16 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>;
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+       Sched<[itins.Sched]>;
   def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
-                                     (bitconvert (memop_frag addr:$src2)))))]>;
+                                     (bitconvert (memop_frag addr:$src2)))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -3779,7 +3858,7 @@ defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                             VR128, v4i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
 
-let ExeDomain = SSEPackedInt in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
   // 128-bit logical shifts.
   def VPSLLDQri : PDIi8<0x73, MRM7r,
                     (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
@@ -3825,7 +3904,7 @@ defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                              VR256, v8i32, v4i32, bc_v4i32,
                              SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 
-let ExeDomain = SSEPackedInt in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
   // 256-bit logical shifts.
   def VPSLLDQYri : PDIi8<0x73, MRM7r,
                     (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
@@ -3871,7 +3950,7 @@ defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P>;
 
-let ExeDomain = SSEPackedInt in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
   // 128-bit logical shifts.
   def PSLLDQri : PDIi8<0x73, MRM7r,
                        (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
@@ -3966,14 +4045,15 @@ let Predicates = [HasAVX] in {
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR128:$dst,
                         (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
-                      IIC_SSE_PSHUF>, VEX;
+                      IIC_SSE_PSHUF>, VEX, Sched<[WriteShuffle]>;
   def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                       (ins i128mem:$src1, i8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
-                        (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX;
+                        (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX,
+                  Sched<[WriteShuffleLd]>;
 }
 
 let Predicates = [HasAVX2] in {
@@ -3983,14 +4063,15 @@ let Predicates = [HasAVX2] in {
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                        [(set VR256:$dst,
                          (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
-                       IIC_SSE_PSHUF>, VEX, VEX_L;
+                       IIC_SSE_PSHUF>, VEX, VEX_L, Sched<[WriteShuffle]>;
   def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                        (ins i256mem:$src1, i8imm:$src2),
                        !strconcat("v", OpcodeStr,
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode (bitconvert (memopv4i64 addr:$src1)),
-                         (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L;
+                         (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L,
+                   Sched<[WriteShuffleLd]>;
 }
 
 let Predicates = [UseSSE2] in {
@@ -4000,14 +4081,15 @@ let Predicates = [UseSSE2] in {
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VR128:$dst,
                   (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
-                IIC_SSE_PSHUF>;
+                IIC_SSE_PSHUF>, Sched<[WriteShuffle]>;
   def mi : Ii8<0x70, MRMSrcMem,
                (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VR128:$dst,
                   (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
-                          (i8 imm:$src2))))], IIC_SSE_PSHUF>;
+                          (i8 imm:$src2))))], IIC_SSE_PSHUF>,
+           Sched<[WriteShuffleLd]>;
 }
 }
 } // ExeDomain = SSEPackedInt
@@ -4043,7 +4125,7 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
           !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
-      IIC_SSE_UNPCK>;
+      IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
   def rm : PDI<opc, MRMSrcMem,
       (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
@@ -4052,7 +4134,8 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
       [(set VR128:$dst, (OpNode VR128:$src1,
                                   (bc_frag (memopv2i64
                                                addr:$src2))))],
-                                               IIC_SSE_UNPCK>;
+                                               IIC_SSE_UNPCK>,
+      Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
 multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
@@ -4060,12 +4143,14 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
   def Yrr : PDI<opc, MRMSrcReg,
       (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>;
+      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
+      Sched<[WriteShuffle]>;
   def Yrm : PDI<opc, MRMSrcMem,
       (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (OpNode VR256:$src1,
-                                  (bc_frag (memopv4i64 addr:$src2))))]>;
+                                  (bc_frag (memopv4i64 addr:$src2))))]>,
+      Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
 let Predicates = [HasAVX] in {
@@ -4142,7 +4227,8 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
            "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set VR128:$dst,
-         (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>;
+         (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>,
+       Sched<[WriteShuffle]>;
   def rmi : Ii8<0xC4, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1,
                         i16mem:$src2, i32i8imm:$src3),
@@ -4151,7 +4237,8 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
            "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set VR128:$dst,
          (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
-                    imm:$src3))], IIC_SSE_PINSRW>;
+                    imm:$src3))], IIC_SSE_PINSRW>,
+       Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
 // Extract
@@ -4160,12 +4247,14 @@ def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                     (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                     "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
-                                                imm:$src2))]>, TB, OpSize, VEX;
+                                                imm:$src2))]>, TB, OpSize, VEX,
+                Sched<[WriteShuffle]>;
 def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                     (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
-                                                imm:$src2))], IIC_SSE_PEXTRW>;
+                                                imm:$src2))], IIC_SSE_PEXTRW>,
+               Sched<[WriteShuffleLd, ReadAfterLd]>;
 
 // Insert
 let Predicates = [HasAVX] in {
@@ -4173,7 +4262,7 @@ let Predicates = [HasAVX] in {
   def  VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst),
        (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
        "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-       []>, TB, OpSize, VEX_4V;
+       []>, TB, OpSize, VEX_4V, Sched<[WriteShuffle]>;
 }
 
 let Constraints = "$src1 = $dst" in
@@ -4185,7 +4274,7 @@ let Constraints = "$src1 = $dst" in
 // SSE2 - Packed Mask Creation
 //===---------------------------------------------------------------------===//
 
-let ExeDomain = SSEPackedInt in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
 
 def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
@@ -4213,7 +4302,7 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
 // SSE2 - Conditional Store
 //===---------------------------------------------------------------------===//
 
-let ExeDomain = SSEPackedInt in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
 
 let Uses = [EDI] in
 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
@@ -4252,41 +4341,42 @@ def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
-                        VEX;
+                        VEX, Sched<[WriteMove]>;
 def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                         IIC_SSE_MOVDQ>,
-                      VEX;
+                      VEX, Sched<[WriteLoad]>;
 def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v2i64 (scalar_to_vector GR64:$src)))],
-                          IIC_SSE_MOVDQ>, VEX;
+                          IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
 def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set FR64:$dst, (bitconvert GR64:$src))],
-                       IIC_SSE_MOVDQ>, VEX;
+                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
 
 def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
-                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>;
+                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+                  Sched<[WriteMove]>;
 def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
-                        IIC_SSE_MOVDQ>;
+                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
 def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v2i64 (scalar_to_vector GR64:$src)))],
-                          IIC_SSE_MOVDQ>;
+                          IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
 def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set FR64:$dst, (bitconvert GR64:$src))],
-                       IIC_SSE_MOVDQ>;
+                       IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
 
 //===---------------------------------------------------------------------===//
 // Move Int Doubleword to Single Scalar
@@ -4294,22 +4384,22 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
 def VMOVDI2SSrr  : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert GR32:$src))],
-                      IIC_SSE_MOVDQ>, VEX;
+                      IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
 
 def VMOVDI2SSrm  : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                       IIC_SSE_MOVDQ>,
-                      VEX;
+                      VEX, Sched<[WriteLoad]>;
 def MOVDI2SSrr  : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert GR32:$src))],
-                      IIC_SSE_MOVDQ>;
+                      IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
 
 def MOVDI2SSrm  : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
-                      IIC_SSE_MOVDQ>;
+                      IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
 
 //===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int to Packed Double Int
@@ -4317,26 +4407,29 @@ def MOVDI2SSrm  : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
 def VMOVPDI2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
-                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX;
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
+                    Sched<[WriteMove]>;
 def VMOVPDI2DImr  : VPDI<0x7E, MRMDestMem, (outs),
                        (ins i32mem:$dst, VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (vector_extract (v4i32 VR128:$src),
                                      (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
-                                     VEX;
+                                     VEX, Sched<[WriteLoad]>;
 def MOVPDI2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
-                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>;
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+                   Sched<[WriteMove]>;
 def MOVPDI2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (vector_extract (v4i32 VR128:$src),
                                      (iPTR 0))), addr:$dst)],
-                                     IIC_SSE_MOVDQ>;
+                                     IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
 
 //===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int first element to Doubleword Int
 //
+let SchedRW = [WriteMove] in {
 def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                           "vmov{d|q}\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
@@ -4349,6 +4442,7 @@ def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                         [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                          (iPTR 0)))],
                                                          IIC_SSE_MOVD_ToGP>;
+} //SchedRW
 
 //===---------------------------------------------------------------------===//
 // Bitcast FR64 <-> GR64
@@ -4357,28 +4451,28 @@ let Predicates = [HasAVX] in
 def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
-                        VEX;
+                        VEX, Sched<[WriteLoad]>;
 def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                          "mov{d|q}\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (bitconvert FR64:$src))],
-                         IIC_SSE_MOVDQ>, VEX;
+                         IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
 def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
-                         IIC_SSE_MOVDQ>, VEX;
+                         IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
 
 def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
-                       IIC_SSE_MOVDQ>;
+                       IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
 def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (bitconvert FR64:$src))],
-                       IIC_SSE_MOVD_ToGP>;
+                       IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
 def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
-                       IIC_SSE_MOVDQ>;
+                       IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
 
 //===---------------------------------------------------------------------===//
 // Move Scalar Single to Double Int
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index ad55058..a37a8cc 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -22,7 +22,7 @@ def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
 
 let isBranch = 1, isTerminator = 1, Defs = [EAX] in
 def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst),
-                         "xbegin\t$dst", []>;
+                         "xbegin\t$dst", []>, Requires<[HasRTM]>;
 
 def XEND : I<0x01, MRM_D5, (outs), (ins),
              "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 3af1b3e..a8a9fd8 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -407,6 +407,57 @@ ReSimplify:
     LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
     break;
 
+  // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
+  // if one of the registers is extended, but other isn't.
+  case X86::VMOVAPDrr:
+  case X86::VMOVAPDYrr:
+  case X86::VMOVAPSrr:
+  case X86::VMOVAPSYrr:
+  case X86::VMOVDQArr:
+  case X86::VMOVDQAYrr:
+  case X86::VMOVDQUrr:
+  case X86::VMOVDQUYrr:
+  case X86::VMOVUPDrr:
+  case X86::VMOVUPDYrr:
+  case X86::VMOVUPSrr:
+  case X86::VMOVUPSYrr: {
+    if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+        X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
+      unsigned NewOpc;
+      switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
+      case X86::VMOVAPDrr:  NewOpc = X86::VMOVAPDrr_REV;  break;
+      case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+      case X86::VMOVAPSrr:  NewOpc = X86::VMOVAPSrr_REV;  break;
+      case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+      case X86::VMOVDQArr:  NewOpc = X86::VMOVDQArr_REV;  break;
+      case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+      case X86::VMOVDQUrr:  NewOpc = X86::VMOVDQUrr_REV;  break;
+      case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+      case X86::VMOVUPDrr:  NewOpc = X86::VMOVUPDrr_REV;  break;
+      case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+      case X86::VMOVUPSrr:  NewOpc = X86::VMOVUPSrr_REV;  break;
+      case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+      }
+      OutMI.setOpcode(NewOpc);
+    }
+    break;
+  }
+  case X86::VMOVSDrr:
+  case X86::VMOVSSrr: {
+    if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+        X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
+      unsigned NewOpc;
+      switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
+      case X86::VMOVSDrr:   NewOpc = X86::VMOVSDrr_REV;   break;
+      case X86::VMOVSSrr:   NewOpc = X86::VMOVSSrr_REV;   break;
+      }
+      OutMI.setOpcode(NewOpc);
+    }
+    break;
+  }
+
   // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
   // inputs modeled as normal uses instead of implicit uses.  As such, truncate
   // off all but the first operand (the callee).  FIXME: Change isel.
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index d99d085..da0ca7d 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -7,6 +7,78 @@
 //
 //===----------------------------------------------------------------------===//
 
+// InstrSchedModel annotations for out-of-order CPUs.
+//
+// These annotations are independent of the itinerary classes defined below.
+
+// Instructions with folded loads need to read the memory operand immediately,
+// but other register operands don't have to be read until the load is ready.
+// These operands are marked with ReadAfterLd.
+def ReadAfterLd : SchedRead;
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+def WriteRMW : SchedWrite;
+
+// Most instructions can fold loads, so almost every SchedWrite comes in two
+// variants: With and without a folded load.
+// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
+// with a folded load.
+class X86FoldableSchedWrite : SchedWrite {
+  // The SchedWrite to use when a load is folded into the instruction.
+  SchedWrite Folded;
+}
+
+// Multiclass that produces a linked pair of SchedWrites.
+multiclass X86SchedWritePair {
+  // Register-Memory operation.
+  def Ld : SchedWrite;
+  // Register-Register operation.
+  def NAME : X86FoldableSchedWrite {
+    let Folded = !cast<SchedWrite>(NAME#"Ld");
+  }
+}
+
+// Arithmetic.
+defm WriteALU  : X86SchedWritePair; // Simple integer ALU op.
+defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+defm WriteIDiv : X86SchedWritePair; // Integer division.
+def  WriteLEA  : SchedWrite;        // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm WriteShift : X86SchedWritePair;
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad  : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteMove  : SchedWrite;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm WriteJump : X86SchedWritePair;
+
+// Floating point. This covers both scalar and vector operations.
+defm WriteFAdd  : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul  : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv  : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp  : X86SchedWritePair; // Floating point reciprocal.
+
+// Vector integer operations.
+defm WriteVecALU   : X86SchedWritePair; // Vector integer ALU op, no logicals.
+defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
+defm WriteVecIMul  : X86SchedWritePair; // Vector integer multiply.
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
+defm WriteShuffle  : X86SchedWritePair; // Vector shuffles and blends.
+
+// Conversion between integer and float.
+defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
+defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
+defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+
 //===----------------------------------------------------------------------===//
 // Instruction Itinerary classes used for X86
 def IIC_DEFAULT     : InstrItinClass;
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index fefb479..be2a997 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -176,18 +176,42 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
     { ISD::MUL,     MVT::v8i32,    4 },
     { ISD::SUB,     MVT::v8i32,    4 },
     { ISD::ADD,     MVT::v8i32,    4 },
-    { ISD::MUL,     MVT::v4i64,    4 },
     { ISD::SUB,     MVT::v4i64,    4 },
     { ISD::ADD,     MVT::v4i64,    4 },
-    };
+    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+    // are lowered as a series of long multiplies(3), shifts(4) and adds(2)
+    // Because we believe v4i64 to be a legal type, we must also include the
+    // split factor of two in the cost table. Therefore, the cost here is 18
+    // instead of 9.
+    { ISD::MUL,     MVT::v4i64,    18 },
+  };
 
   // Look for AVX1 lowering tricks.
-  if (ST->hasAVX()) {
-    int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable), ISD,
-                          LT.second);
+  if (ST->hasAVX() && !ST->hasAVX2()) {
+    int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable),
+                                   ISD, LT.second);
     if (Idx != -1)
       return LT.first * AVX1CostTable[Idx].Cost;
   }
+
+  // Custom lowering of vectors.
+  static const CostTblEntry<MVT> CustomLowered[] = {
+    // A v2i64/v4i64 and multiply is custom lowered as a series of long
+    // multiplies(3), shifts(4) and adds(2).
+    { ISD::MUL,     MVT::v2i64,    9 },
+    { ISD::MUL,     MVT::v4i64,    9 },
+  };
+  int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered),
+                                 ISD, LT.second);
+  if (Idx != -1)
+    return LT.first * CustomLowered[Idx].Cost;
+
+  // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
+  // 2x pmuludq, 2x shuffle.
+  if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
+      !ST->hasSSE41())
+    return 6;
+
   // Fallback to the default implementation.
   return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty);
 }
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index f8a9125..a5d2be8 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -84,7 +84,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
   setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
 
   // XCore does not have the NodeTypes below.
-  setOperationAction(ISD::BR_CC,     MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,     MVT::i32,   Expand);
   setOperationAction(ISD::SELECT_CC, MVT::i32,   Custom);
   setOperationAction(ISD::ADDC, MVT::i32, Expand);
   setOperationAction(ISD::ADDE, MVT::i32, Expand);
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 6d430ef..8d258f5 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -84,7 +84,7 @@ namespace llvm {
     explicit XCoreTargetLowering(XCoreTargetMachine &TM);
 
     virtual unsigned getJumpTableEncoding() const;
-    virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
 
     /// LowerOperation - Provide custom lowering hooks for some operations.
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;