author:    Stephen Hines <srhines@google.com>  2014-05-29 02:49:00 -0700
committer: Stephen Hines <srhines@google.com>  2014-05-29 02:49:00 -0700
commit:    dce4a407a24b04eebc6a376f8e62b41aaa7b071f (patch)
tree:      dcebc53f2b182f145a2e659393bf9a0472cedf23 /lib/Target/R600
parent:    220b921aed042f9e520c26cffd8282a94c66c3d5 (diff)
download:  external_llvm-dce4a407a24b04eebc6a376f8e62b41aaa7b071f (.zip, .tar.gz, .tar.bz2)
Update LLVM for 3.5 rebase (r209712).
Change-Id: I149556c940fb7dc92d075273c87ff584f400941f
Diffstat (limited to 'lib/Target/R600')
77 files changed, 3148 insertions, 1599 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 3e1848b..949fdfb 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -37,11 +37,15 @@ FunctionPass *createAMDGPUCFGStructurizerPass(); // SI Passes FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); +FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); +void initializeSILowerI1CopiesPass(PassRegistry &); +extern char &SILowerI1CopiesID; + // Passes common to R600 and SI Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); @@ -76,8 +80,8 @@ enum AddressSpaces { GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). CONSTANT_ADDRESS = 2, ///< Address space for constant memory LOCAL_ADDRESS = 3, ///< Address space for local memory. - REGION_ADDRESS = 4, ///< Address space for region memory. - ADDRESS_NONE = 5, ///< Address space for unknown memory. + FLAT_ADDRESS = 4, ///< Address space for flat memory. + REGION_ADDRESS = 5, ///< Address space for region memory. PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) @@ -102,7 +106,8 @@ enum AddressSpaces { CONSTANT_BUFFER_13 = 21, CONSTANT_BUFFER_14 = 22, CONSTANT_BUFFER_15 = 23, - LAST_ADDRESS = 24 + ADDRESS_NONE = 24, ///< Address space for unknown memory. + LAST_ADDRESS = ADDRESS_NONE }; } // namespace AMDGPUAS diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index d1e2cf5..2edc115 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -120,6 +120,17 @@ def AMDGPU : Target { let InstructionSet = AMDGPUInstrInfo; } +//===----------------------------------------------------------------------===// +// Predicate helper class +//===----------------------------------------------------------------------===// + +class PredicateControl { + Predicate SubtargetPredicate; + list<Predicate> OtherPredicates = []; + list<Predicate> Predicates = !listconcat([SubtargetPredicate], + OtherPredicates); +} + // Include AMDGPU TD files include "R600Schedule.td" include "SISchedule.td" diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index b166c45..170f479 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -64,7 +64,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { - findNumUsedRegistersSI(MF, KernelInfo.NumSGPR, KernelInfo.NumVGPR); + getSIProgramInfo(KernelInfo, MF); EmitProgramInfoSI(MF, KernelInfo); } else { EmitProgramInfoR600(MF); @@ -84,8 +84,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SectionKind::getReadOnly()); OutStreamer.SwitchSection(CommentSection); - if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { + if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { OutStreamer.emitRawComment(" Kernel info:", false); + OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), + false); OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), 
false); OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), @@ -184,9 +186,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { } } -void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF, - unsigned &NumSGPR, - unsigned &NumVGPR) const { +void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, + MachineFunction &MF) const { + uint64_t CodeSize = 0; unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; @@ -200,6 +202,9 @@ void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF, I != E; ++I) { MachineInstr &MI = *I; + // TODO: CodeSize should account for multiple functions. + CodeSize += MI.getDesc().Size; + unsigned numOperands = MI.getNumOperands(); for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { MachineOperand &MO = MI.getOperand(op_idx); @@ -274,13 +279,9 @@ void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF, if (VCCUsed) MaxSGPR += 2; - NumSGPR = MaxSGPR; - NumVGPR = MaxVGPR; -} - -void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &Out, - MachineFunction &MF) const { - findNumUsedRegistersSI(MF, Out.NumSGPR, Out.NumVGPR); + ProgInfo.CodeLen = CodeSize; + ProgInfo.NumSGPR = MaxSGPR; + ProgInfo.NumVGPR = MaxVGPR; } void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index a2b8337..71adc9a 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -24,7 +24,12 @@ namespace llvm { class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { - SIProgramInfo() : NumSGPR(0), NumVGPR(0) {} + SIProgramInfo() : + CodeLen(0), + NumSGPR(0), + NumVGPR(0) {} + + uint64_t CodeLen; unsigned NumSGPR; unsigned NumVGPR; }; @@ -42,14 +47,14 @@ private: public: explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer); - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "AMDGPU Assembly Printer"; } /// Implemented in AMDGPUMCInstLower.cpp - virtual void EmitInstruction(const MachineInstr *MI); + void EmitInstruction(const MachineInstr *MI) override; protected: bool DisasmEnabled; diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td index 65cdb24..5f8ad8c 100644 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -20,7 +20,7 @@ def CC_SI : CallingConv<[ CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, - SGPR16 + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21 ]>>>, CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp index 50297d1..91aeee2 100644 --- a/lib/Target/R600/AMDGPUConvertToISA.cpp +++ b/lib/Target/R600/AMDGPUConvertToISA.cpp @@ -31,9 +31,9 @@ public: AMDGPUConvertToISAPass(TargetMachine &tm) : MachineFunctionPass(ID), TM(tm) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const {return "AMDGPU Convert to ISA";} + const char *getPassName() const override {return "AMDGPU Convert to ISA";} }; diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp index 
0325a00..e7e90d3 100644 --- a/lib/Target/R600/AMDGPUFrameLowering.cpp +++ b/lib/Target/R600/AMDGPUFrameLowering.cpp @@ -97,7 +97,7 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, const TargetFrameLowering::SpillSlot * AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { NumEntries = 0; - return 0; + return nullptr; } void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const { diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h index cf5742e..d18ede5 100644 --- a/lib/Target/R600/AMDGPUFrameLowering.h +++ b/lib/Target/R600/AMDGPUFrameLowering.h @@ -33,12 +33,13 @@ public: /// \returns The number of 32-bit sub-registers that are used when storing /// values to the stack. - virtual unsigned getStackWidth(const MachineFunction &MF) const; - virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const; - virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const; - virtual void emitPrologue(MachineFunction &MF) const; - virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; - virtual bool hasFP(const MachineFunction &MF) const; + unsigned getStackWidth(const MachineFunction &MF) const; + int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + const SpillSlot * + getCalleeSavedSpillSlots(unsigned &NumEntries) const override; + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + bool hasFP(const MachineFunction &MF) const override; }; } // namespace llvm #endif // AMDILFRAME_LOWERING_H diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index e8c5f5b..f1f0bfa 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -16,15 +16,11 @@ #include "AMDGPURegisterInfo.h" #include "R600InstrInfo.h" #include "SIISelLowering.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/ValueMap.h" -#include "llvm/Support/Compiler.h" -#include <list> -#include <queue> +#include "llvm/IR/Function.h" using namespace llvm; @@ -43,11 +39,12 @@ public: AMDGPUDAGToDAGISel(TargetMachine &TM); virtual ~AMDGPUDAGToDAGISel(); - SDNode *Select(SDNode *N); - virtual const char *getPassName() const; - virtual void PostprocessISelDAG(); + SDNode *Select(SDNode *N) override; + const char *getPassName() const override; + void PostprocessISelDAG() override; private: + bool isInlineImmediate(SDNode *N) const; inline SDValue getSmallIPtrImm(unsigned Imm); bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, const R600InstrInfo *TII); @@ -58,11 +55,9 @@ private: bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); - SDValue SimplifyI24(SDValue &Op); - bool SelectI24(SDValue Addr, SDValue &Op); - bool SelectU24(SDValue Addr, SDValue &Op); static bool checkType(const Value *ptr, unsigned int addrspace); + static bool checkPrivateAddress(const MachineMemOperand *Op); static bool isGlobalStore(const StoreSDNode *N); static bool isPrivateStore(const StoreSDNode *N); @@ -77,10 +72,15 @@ private: bool isLocalLoad(const LoadSDNode *N) const; bool 
isRegionLoad(const LoadSDNode *N) const; + /// \returns True if the current basic block being selected is at control + /// flow depth 0. Meaning that the current block dominates the + // exit block. + bool isCFDepth0() const; + const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); - bool SelectGlobalValueVariableOffset(SDValue Addr, - SDValue &BaseReg, SDValue& Offset); + bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, + SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); @@ -91,8 +91,7 @@ private: /// \brief This pass converts a legalized DAG into a AMDGPU-specific // DAG, ready for instruction scheduling. -FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM - ) { +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { return new AMDGPUDAGToDAGISel(TM); } @@ -103,32 +102,39 @@ AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { } +bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { + const SITargetLowering *TL + = static_cast<const SITargetLowering *>(getTargetLowering()); + return TL->analyzeImmediate(N) == 0; +} + /// \brief Determine the register class for \p OpNo /// \returns The register class of the virtual register that will be used for /// the given operand number \OpNo or NULL if the register class cannot be /// determined. const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, unsigned OpNo) const { - if (!N->isMachineOpcode()) { - return NULL; - } + if (!N->isMachineOpcode()) + return nullptr; + switch (N->getMachineOpcode()) { default: { const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode()); unsigned OpIdx = Desc.getNumDefs() + OpNo; if (OpIdx >= Desc.getNumOperands()) - return NULL; + return nullptr; int RegClass = Desc.OpInfo[OpIdx].RegClass; - if (RegClass == -1) { - return NULL; - } + if (RegClass == -1) + return nullptr; + return TM.getRegisterInfo()->getRegClass(RegClass); } case AMDGPU::REG_SEQUENCE: { - const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass( - cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()); - unsigned SubRegIdx = - dyn_cast<ConstantSDNode>(N->getOperand(OpNo + 1))->getZExtValue(); + unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(RCID); + + SDValue SubRegOp = N->getOperand(OpNo + 1); + unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx); } } @@ -139,7 +145,7 @@ SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) { } bool AMDGPUDAGToDAGISel::SelectADDRParam( - SDValue Addr, SDValue& R1, SDValue& R2) { + SDValue Addr, SDValue& R1, SDValue& R2) { if (Addr.getOpcode() == ISD::FrameIndex) { if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { @@ -196,15 +202,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { N->setNodeId(-1); - return NULL; // Already selected. + return nullptr; // Already selected. 
} + + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); switch (Opc) { default: break; // We are selecting i64 ADD here instead of custom lower it during // DAG legalization, so we can fold some i64 ADDs used for address // calculation into the LOAD and STORE instructions. case ISD::ADD: { - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); if (N->getValueType(0) != MVT::i64 || ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; @@ -232,12 +239,13 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { AddLoArgs.push_back(SDValue(Lo0, 0)); AddLoArgs.push_back(SDValue(Lo1, 0)); - SDNode *AddLo = CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, - VTList, AddLoArgs); + SDNode *AddLo = CurDAG->getMachineNode( + isCFDepth0() ? AMDGPU::S_ADD_I32 : AMDGPU::V_ADD_I32_e32, + DL, VTList, AddLoArgs); SDValue Carry = SDValue(AddLo, 1); - SDNode *AddHi = CurDAG->getMachineNode(AMDGPU::S_ADDC_U32, DL, - MVT::i32, SDValue(Hi0, 0), - SDValue(Hi1, 0), Carry); + SDNode *AddHi = CurDAG->getMachineNode( + isCFDepth0() ? AMDGPU::S_ADDC_U32 : AMDGPU::V_ADDC_U32_e32, + DL, MVT::i32, SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); SDValue Args[5] = { CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), @@ -246,11 +254,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue(AddHi,0), Sub1, }; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5); + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); } case ISD::BUILD_VECTOR: { unsigned RegClassID; - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); const AMDGPURegisterInfo *TRI = static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo()); const SIRegisterInfo *SIRI = @@ -316,7 +323,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // 16 = Max Num Vector Elements // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) // 1 = Vector Register Class - SDValue RegSeqArgs[16 * 2 + 1]; + SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(N->getNumOperands() * 2 + 1); RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32); bool IsRegSeq = true; @@ -333,11 +340,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { if (!IsRegSeq) break; return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), - RegSeqArgs, 2 * N->getNumOperands() + 1); + RegSeqArgs); } case ISD::BUILD_PAIR: { SDValue RC, SubReg0, SubReg1; - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { break; } @@ -346,7 +352,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32); SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32); } else if (N->getValueType(0) == MVT::i64) { - RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32); + RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32); SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32); SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32); } else { @@ -357,8 +363,37 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N), N->getValueType(0), Ops); } - case AMDGPUISD::REGISTER_LOAD: { + + case ISD::Constant: + case ISD::ConstantFP: { const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + break; + + uint64_t Imm; + if (ConstantFPSDNode *FP = 
dyn_cast<ConstantFPSDNode>(N)) + Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); + else { + ConstantSDNode *C = cast<ConstantSDNode>(N); + Imm = C->getZExtValue(); + } + + SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32, + CurDAG->getConstant(Imm & 0xFFFFFFFF, MVT::i32)); + SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32, + CurDAG->getConstant(Imm >> 32, MVT::i32)); + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32), + SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N), + N->getValueType(0), Ops); + } + + case AMDGPUISD::REGISTER_LOAD: { if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) break; SDValue Addr, Offset; @@ -375,7 +410,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { Ops); } case AMDGPUISD::REGISTER_STORE: { - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) break; SDValue Addr, Offset; @@ -391,42 +425,95 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { CurDAG->getVTList(MVT::Other), Ops); } + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + // There is a scalar version available, but unlike the vector version which + // has a separate operand for the offset and width, the scalar version packs + // the width and offset into a single operand. Try to move to the scalar + // version if the offsets are constant, so that we can try to keep extended + // loads of kernel arguments in SGPRs. + + // TODO: Technically we could try to pattern match scalar bitshifts of + // dynamic values, but it's probably not useful. + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Offset) + break; + + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); + if (!Width) + break; + + bool Signed = Opc == AMDGPUISD::BFE_I32; + + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + + uint32_t OffsetVal = Offset->getZExtValue(); + uint32_t WidthVal = Width->getZExtValue(); + + uint32_t PackedVal = OffsetVal | WidthVal << 16; + + SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32); + return CurDAG->getMachineNode(Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, + SDLoc(N), + MVT::i32, + N->getOperand(0), + PackedOffsetWidth); + + } } return SelectCode(N); } -bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) { - if (!ptr) { +bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { + assert(AS != 0 && "Use checkPrivateAddress instead."); + if (!Ptr) return false; - } - Type *ptrType = ptr->getType(); - return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace; + + return Ptr->getType()->getPointerAddressSpace() == AS; +} + +bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { + if (Op->getPseudoValue()) + return true; + + if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType())) + return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + return false; } bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { - return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { - return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)); + const Value *MemVal = N->getMemOperand()->getValue(); + return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); } bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { - return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { - return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); } bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { - if (CbId == -1) { - return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS); - } - return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_BUFFER_0 + CbId); + const Value *MemVal = N->getMemOperand()->getValue(); + if (CbId == -1) + return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); + + return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); } bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { @@ -437,27 +524,26 @@ bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { return true; } } - return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { - return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); } bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { - return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { - return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); } bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { MachineMemOperand *MMO = N->getMemOperand(); - if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { + if (checkPrivateAddress(N->getMemOperand())) { if (MMO) { - const Value *V 
= MMO->getValue(); - const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V); + const PseudoSourceValue *PSV = MMO->getPseudoValue(); if (PSV && PSV == PseudoSourceValue::getConstantPool()) { return true; } @@ -467,24 +553,34 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { } bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { - if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { + if (checkPrivateAddress(N->getMemOperand())) { // Check to make sure we are not a constant pool load or a constant load // that is marked as a private load if (isCPLoad(N) || isConstantLoad(N, -1)) { return false; } } - if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) { + + const Value *MemVal = N->getMemOperand()->getValue(); + if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && + !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){ return true; } return false; } +bool AMDGPUDAGToDAGISel::isCFDepth0() const { + // FIXME: Figure out a way to use DominatorTree analysis here. + const BasicBlock *CurBlock = FuncInfo->MBB->getBasicBlock(); + const Function *Fn = FuncInfo->Fn; + return &Fn->front() == CurBlock || &Fn->back() == CurBlock; +} + + const char *AMDGPUDAGToDAGISel::getPassName() const { return "AMDGPU DAG->DAG Pattern Instruction Selection"; } @@ -499,7 +595,7 @@ const char *AMDGPUDAGToDAGISel::getPassName() const { //===----------------------------------------------------------------------===// bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, - SDValue& IntPtr) { + SDValue& IntPtr) { if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true); return true; @@ -509,7 +605,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, SDValue& BaseReg, SDValue &Offset) { - if (!dyn_cast<ConstantSDNode>(Addr)) { + if (!isa<ConstantSDNode>(Addr)) { BaseReg = Addr; Offset = CurDAG->getIntPtrConstant(0, true); return true; @@ -519,7 +615,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset) { - ConstantSDNode * IMMOffset; + ConstantSDNode *IMMOffset; if (Addr.getOpcode() == ISD::ADD && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) @@ -563,52 +659,9 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } -SDValue AMDGPUDAGToDAGISel::SimplifyI24(SDValue &Op) { - APInt Demanded = APInt(32, 0x00FFFFFF); - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(*CurDAG, true, true); - const TargetLowering *TLI = getTargetLowering(); - if (TLI->SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) { - CurDAG->ReplaceAllUsesWith(Op, TLO.New); - CurDAG->RepositionNode(Op.getNode(), TLO.New.getNode()); - return SimplifyI24(TLO.New); - } else { - return Op; - } -} - -bool AMDGPUDAGToDAGISel::SelectI24(SDValue Op, SDValue &I24) 
{ - - assert(Op.getValueType() == MVT::i32); - - if (CurDAG->ComputeNumSignBits(Op) == 9) { - I24 = SimplifyI24(Op); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectU24(SDValue Op, SDValue &U24) { - APInt KnownZero; - APInt KnownOne; - CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne); - - assert (Op.getValueType() == MVT::i32); - - // ANY_EXTEND and EXTLOAD operations can only be done on types smaller than - // i32. These smaller types are legal to use with the i24 instructions. - if ((KnownZero & APInt(KnownZero.getBitWidth(), 0xFF000000)) == 0xFF000000 || - Op.getOpcode() == ISD::ANY_EXTEND || - ISD::isEXTLoad(Op.getNode())) { - U24 = SimplifyI24(Op); - return true; - } - return false; -} - void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = - (*(const AMDGPUTargetLowering*)getTargetLowering()); + *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); bool IsModified = false; do { IsModified = false; diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 183725c..6c443ea 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -28,8 +28,50 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" using namespace llvm; + +namespace { + +/// Diagnostic information for unimplemented or unsupported feature reporting. +class DiagnosticInfoUnsupported : public DiagnosticInfo { +private: + const Twine &Description; + const Function &Fn; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(getKindID(), Severity), + Description(Desc), + Fn(Fn) { } + + const Function &getFunction() const { return Fn; } + const Twine &getDescription() const { return Description; } + + void print(DiagnosticPrinter &DP) const override { + DP << "unsupported " << getDescription() << " in " << Fn.getName(); + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticInfoUnsupported::KindID = 0; +} + + static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -88,6 +130,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::f64, Promote); AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + setOperationAction(ISD::STORE, MVT::v2f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); + // Custom lowering of vector stores is required for local address space // stores. setOperationAction(ISD::STORE, MVT::v4i32, Custom); @@ -103,6 +148,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : // handle 64-bit stores. 
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i8, Expand); setTruncStoreAction(MVT::i64, MVT::i1, Expand); setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); @@ -126,6 +173,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::f64, Promote); AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); + setOperationAction(ISD::LOAD, MVT::v2f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); @@ -152,15 +202,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::FNEG, MVT::v2f32, Expand); setOperationAction(ISD::FNEG, MVT::v4f32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::SUB, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Custom); + setOperationAction(ISD::UDIVREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::VSELECT, MVT::v2f32, Expand); setOperationAction(ISD::VSELECT, MVT::v4f32, Expand); @@ -168,10 +222,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : static const MVT::SimpleValueType IntTypes[] = { MVT::v2i32, MVT::v4i32 }; - const size_t NumIntTypes = array_lengthof(IntTypes); - for (unsigned int x = 0; x < NumIntTypes; ++x) { - MVT::SimpleValueType VT = IntTypes[x]; + for (MVT VT : IntTypes) { //Expand the following operations for the current type by default setOperationAction(ISD::ADD, VT, Expand); setOperationAction(ISD::AND, VT, Expand); @@ -195,12 +247,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : static const MVT::SimpleValueType FloatTypes[] = { MVT::v2f32, MVT::v4f32 }; - const size_t NumFloatTypes = array_lengthof(FloatTypes); - for (unsigned int x = 0; x < NumFloatTypes; ++x) { - MVT::SimpleValueType VT = FloatTypes[x]; + for (MVT VT : FloatTypes) { setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); @@ -208,25 +259,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FMUL, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); } - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); - 
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT_CC); } //===----------------------------------------------------------------------===// @@ -325,6 +364,25 @@ SDValue AMDGPUTargetLowering::LowerReturn( // Target specific lowering //===---------------------------------------------------------------------===// +SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SDValue Callee = CLI.Callee; + SelectionDAG &DAG = CLI.DAG; + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + StringRef FuncName("<unknown>"); + + if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) + FuncName = G->getSymbol(); + else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) + FuncName = G->getGlobal()->getName(); + + DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); + DAG.getContext()->diagnose(NoCalls); + return SDValue(); +} + SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -361,12 +419,111 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do // nothing here and let the illegal result integer be handled normally. return; + case ISD::UDIV: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM); + break; + } + case ISD::UREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM.getValue(1)); + break; + } + case ISD::UDIVREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + SDValue one = DAG.getConstant(1, HalfVT); + SDValue zero = DAG.getConstant(0, HalfVT); + + //HiLo split + SDValue LHS = N->getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + + SDValue RHS = N->getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); + + SDValue REM_Hi = zero; + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; + + const unsigned halfBitWidth = HalfVT.getSizeInBits(); + + for (unsigned i = 0; i < halfBitWidth; ++i) { + SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); + // Get Value of high bit + SDValue HBit; + if (halfBitWidth == 32 && Subtarget->hasBFE()) { + HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); + } else { + HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + } + + SDValue Carry = 
DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, + DAG.getConstant(halfBitWidth - 1, HalfVT)); + REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); + REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); + + REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); + REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); + + + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + + SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); + + DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); + + // Update REM + + SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); + + REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); + REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); + REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); + } + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + Results.push_back(DIV); + Results.push_back(REM); + break; + } default: return; } } +// FIXME: This implements accesses to initialized globals in the constant +// address space by copying them to private and accessing that. It does not +// properly handle illegal types or vectors. The private vector loads are not +// scalarized, and the illegal scalars hit an assertion. This technique will not +// work well with large initializers, and this should eventually be +// removed. Initialized globals should be placed into a data section that the +// runtime will load into a buffer before the kernel is executed. Uses of the +// global need to be replaced with a pointer loaded from an implicit kernel +// argument into this buffer holding the copy of the data, which will remove the +// need for any of this. 
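Stepping back to the ISD::UDIVREM expansion a few hunks above: it lowers 64-bit unsigned division as a schoolbook restoring long division, seeding the quotient and remainder with a speculative 32-bit divide of the high half and then folding in the low 32 bits one at a time. The host-side sketch below models that algorithm with plain integers so the loop is easier to follow; udivrem64 is a hypothetical helper written only for illustration and is not part of the patch.

#include <cassert>
#include <cstdint>
#include <utility>

// Scalar model of the 64-bit UDIVREM expansion: split the dividend into
// 32-bit halves, divide the high half directly when the divisor fits in
// 32 bits, then run a restoring long-division loop over the low 32 bits.
// RHS must be nonzero.
std::pair<uint64_t, uint64_t> udivrem64(uint64_t LHS, uint64_t RHS) {
  uint32_t LHS_Lo = static_cast<uint32_t>(LHS);
  uint32_t LHS_Hi = static_cast<uint32_t>(LHS >> 32);
  uint32_t RHS_Lo = static_cast<uint32_t>(RHS);
  uint32_t RHS_Hi = static_cast<uint32_t>(RHS >> 32);

  // The DAG version computes these speculatively and selects on RHS_Hi == 0;
  // here we simply branch.
  uint64_t DIV_Hi = (RHS_Hi == 0) ? LHS_Hi / RHS_Lo : 0;
  uint64_t REM    = (RHS_Hi == 0) ? LHS_Hi % RHS_Lo : LHS_Hi;
  uint64_t DIV_Lo = 0;

  for (unsigned i = 0; i < 32; ++i) {
    unsigned Pos = 31 - i;
    uint64_t HBit = (LHS_Lo >> Pos) & 1;  // next dividend bit, MSB first
    REM = (REM << 1) | HBit;              // shift the remainder and bring it in
    if (REM >= RHS) {                     // restoring step
      REM -= RHS;
      DIV_Lo |= uint64_t(1) << Pos;       // set this quotient bit
    }
  }
  return {(DIV_Hi << 32) | DIV_Lo, REM};
}

int main() {
  uint64_t N = 0x123456789ABCDEF0ull, D = 0x1000003ull;
  auto QR = udivrem64(N, D);
  assert(QR.first == N / D && QR.second == N % D);
  return 0;
}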
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, const SDValue &InitPtr, @@ -380,29 +537,60 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, MachinePointerInfo(UndefValue::get(PtrTy)), false, false, TD->getPrefTypeAlignment(CI->getType())); - } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { + } + + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { EVT VT = EVT::getEVT(CFP->getType()); PointerType *PtrTy = PointerType::get(CFP->getType(), 0); return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr, MachinePointerInfo(UndefValue::get(PtrTy)), false, false, TD->getPrefTypeAlignment(CFP->getType())); - } else if (Init->getType()->isAggregateType()) { + } + + Type *InitTy = Init->getType(); + if (StructType *ST = dyn_cast<StructType>(InitTy)) { + const StructLayout *SL = TD->getStructLayout(ST); + EVT PtrVT = InitPtr.getValueType(); - unsigned NumElements = Init->getType()->getArrayNumElements(); + SmallVector<SDValue, 8> Chains; + + for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { + SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(I); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) { + EVT PtrVT = InitPtr.getValueType(); + + unsigned NumElements; + if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy)) + NumElements = AT->getNumElements(); + else if (VectorType *VT = dyn_cast<VectorType>(SeqTy)) + NumElements = VT->getNumElements(); + else + llvm_unreachable("Unexpected type"); + + unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); SmallVector<SDValue, 8> Chains; for (unsigned i = 0; i < NumElements; ++i) { - SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize( - Init->getType()->getArrayElementType()), PtrVT); + SDValue Offset = DAG.getConstant(i * EltSize, PtrVT); SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i), - GV, Ptr, Chain, DAG)); + + Constant *Elt = Init->getAggregateElement(i); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0], - Chains.size()); - } else { - Init->dump(); - llvm_unreachable("Unhandled constant initializer"); + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } + + Init->dump(); + llvm_unreachable("Unhandled constant initializer"); } SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, @@ -440,7 +628,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, unsigned Size = TD->getTypeAllocSize(EltType); unsigned Alignment = TD->getPrefTypeAlignment(EltType); - const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV); + const GlobalVariable *Var = cast<GlobalVariable>(GV); const Constant *Init = Var->getInitializer(); int FI = FrameInfo->CreateStackObject(Size, Alignment, false); SDValue InitPtr = DAG.getFrameIndex(FI, @@ -461,7 +649,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { Ops.push_back((*I)->getOperand(i)); } - DAG.UpdateNodeOperands(*I, 
&Ops[0], Ops.size()); + DAG.UpdateNodeOperands(*I, Ops); } return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); @@ -469,44 +657,28 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, } } -void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &Args, - unsigned Start, - unsigned Count) const { - EVT VT = Op.getValueType(); - for (unsigned i = Start, e = Start + Count; i != e; ++i) { - Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), - VT.getVectorElementType(), - Op, DAG.getConstant(i, MVT::i32))); - } -} - SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> Args; SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); - ExtractVectorElements(A, DAG, Args, 0, - A.getValueType().getVectorNumElements()); - ExtractVectorElements(B, DAG, Args, 0, - B.getValueType().getVectorNumElements()); + DAG.ExtractVectorElements(A, Args); + DAG.ExtractVectorElements(B, Args); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), - &Args[0], Args.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); } SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> Args; - EVT VT = Op.getValueType(); unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - ExtractVectorElements(Op.getOperand(0), DAG, Args, Start, - VT.getVectorNumElements()); + EVT VT = Op.getValueType(); + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, + VT.getVectorNumElements()); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), - &Args[0], Args.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); } SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, @@ -560,6 +732,22 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umul24: + return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_imul24: + return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_umad24: + return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_imad24: + return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case AMDGPUIntrinsic::AMDGPU_bfe_i32: return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), @@ -590,8 +778,7 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, ///IABS(a) = SMAX(sub(0, a), a) SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, - SelectionDAG &DAG) const { - + SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), @@ -603,7 +790,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, /// Linear Interpolation /// LRP(a, b, c) = muladd(a, b, (1 - a) * c) SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, @@ -617,16 +804,16 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, } /// \brief Generate Min/Max 
node -SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); +SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, + SelectionDAG &DAG) const { + SDLoc DL(N); + EVT VT = N->getValueType(0); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue True = Op.getOperand(2); - SDValue False = Op.getOperand(3); - SDValue CC = Op.getOperand(4); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + SDValue CC = N->getOperand(4); if (VT != MVT::f32 || !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { @@ -654,10 +841,8 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, case ISD::SETOLT: case ISD::SETLE: case ISD::SETLT: { - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); - else - return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); + unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); } case ISD::SETGT: case ISD::SETGE: @@ -665,15 +850,13 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, case ISD::SETOGE: case ISD::SETUGT: case ISD::SETOGT: { - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); - else - return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); + unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); } case ISD::SETCC_INVALID: llvm_unreachable("Invalid setcc condcode!"); } - return Op; + return SDValue(); } SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, @@ -695,8 +878,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, MemEltVT, Load->isVolatile(), Load->isNonTemporal(), Load->getAlignment())); } - return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), - Loads.data(), Loads.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads); } SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, @@ -713,32 +895,46 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, } SDLoc DL(Op); - const SDValue &Value = Store->getValue(); + SDValue Value = Store->getValue(); EVT VT = Value.getValueType(); - const SDValue &Ptr = Store->getBasePtr(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Ptr = Store->getBasePtr(); EVT MemEltVT = MemVT.getVectorElementType(); unsigned MemEltBits = MemEltVT.getSizeInBits(); unsigned MemNumElements = MemVT.getVectorNumElements(); - EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); - SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, PackedVT); + unsigned PackedSize = MemVT.getStoreSizeInBits(); + SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32); + + assert(Value.getValueType().getScalarSizeInBits() >= 32); SDValue PackedValue; for (unsigned i = 0; i < MemNumElements; ++i) { - EVT ElemVT = VT.getVectorElementType(); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, DAG.getConstant(i, MVT::i32)); - Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT); - Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask); - SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT); - Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift); + Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); + Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg + + SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32); + Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, 
Elt, Shift); + if (i == 0) { PackedValue = Elt; } else { - PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt); + PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); } } + + if (PackedSize < 32) { + EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); + return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + PackedVT, + Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + } + return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, - MachinePointerInfo(Store->getMemOperand()->getValue()), + Store->getMemOperand()->getPointerInfo(), Store->isVolatile(), Store->isNonTemporal(), Store->getAlignment()); } @@ -766,7 +962,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, MemEltVT, Store->isVolatile(), Store->isNonTemporal(), Store->getAlignment())); } - return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts); + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); } SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { @@ -788,9 +984,24 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32); } + if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { + assert(VT == MVT::i1 && "Only i1 non-extloads expected"); + // FIXME: Copied from PPC + // First, load into 32 bits, then truncate to 1 bit. + + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + MachineMemOperand *MMO = Load->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, + BasePtr, MVT::i8, MMO); + return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); + } + // Lower loads constant address space global variable loads if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa<GlobalVariable>(GetUnderlyingObject(Load->getPointerInfo().V))) { + isa<GlobalVariable>( + GetUnderlyingObject(Load->getMemOperand()->getValue()))) { SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL, getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); @@ -887,15 +1098,13 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Num = Op.getOperand(0); SDValue Den = Op.getOperand(1); - SmallVector<SDValue, 8> Results; - // RCP = URECIP(Den) = 2^32 / Den + e // e is rounding error. SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); @@ -985,10 +1194,11 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), Remainder_A_Den, Rem, ISD::SETEQ); - SDValue Ops[2]; - Ops[0] = Div; - Ops[1] = Rem; - return DAG.getMergeValues(Ops, 2, DL); + SDValue Ops[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Ops, DL); } SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, @@ -1029,81 +1239,197 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); - unsigned SrcBits = ExtraVT.getScalarType().getSizeInBits(); - unsigned DestBits = ScalarVT.getSizeInBits(); - unsigned BitsDiff = DestBits - SrcBits; - - if (!Subtarget->hasBFE()) - return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG); + if (!VT.isVector()) + return SDValue(); SDValue Src = Op.getOperand(0); - if (VT.isVector()) { - SDLoc DL(Op); - // Need to scalarize this, and revisit each of the scalars later. - // TODO: Don't scalarize on Evergreen? - unsigned NElts = VT.getVectorNumElements(); - SmallVector<SDValue, 8> Args; - ExtractVectorElements(Src, DAG, Args, 0, NElts); + SDLoc DL(Op); - SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); - for (unsigned I = 0; I < NElts; ++I) - Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); + // TODO: Don't scalarize on Evergreen? + unsigned NElts = VT.getVectorNumElements(); + SmallVector<SDValue, 8> Args; + DAG.ExtractVectorElements(Src, Args, 0, NElts); - return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size()); - } + SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); + for (unsigned I = 0; I < NElts; ++I) + Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); - if (SrcBits == 32) { - SDLoc DL(Op); + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); +} - // If the source is 32-bits, this is really half of a 2-register pair, and - // we need to discard the unused half of the pair. - SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src); - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, TruncSrc); - } +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// - unsigned NElts = VT.isVector() ? VT.getVectorNumElements() : 1; +static bool isU24(SDValue Op, SelectionDAG &DAG) { + APInt KnownZero, KnownOne; + EVT VT = Op.getValueType(); + DAG.computeKnownBits(Op, KnownZero, KnownOne); - // TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it - // might not be worth the effort, and will need to expand to shifts when - // fixing SGPR copies. - if (SrcBits < 32 && DestBits <= 32) { - SDLoc DL(Op); - MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts); - - if (DestBits != 32) - Src = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Src); - - // FIXME: This should use TargetConstant, but that hits assertions for - // Evergreen. - SDValue Ext = DAG.getNode(AMDGPUISD::BFE_I32, DL, ExtVT, - Op.getOperand(0), // Operand - DAG.getConstant(0, ExtVT), // Offset - DAG.getConstant(SrcBits, ExtVT)); // Width - - // Truncate to the original type if necessary. - if (ScalarVT == MVT::i32) - return Ext; - return DAG.getNode(ISD::TRUNCATE, DL, VT, Ext); - } + return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; +} - // For small types, extend to 32-bits first. - if (SrcBits < 32) { - SDLoc DL(Op); - MVT ExtVT = (NElts == 1) ? 
MVT::i32 : MVT::getVectorVT(MVT::i32, NElts); +static bool isI24(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); - SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, Src); - SDValue Ext32 = DAG.getNode(AMDGPUISD::BFE_I32, - DL, - ExtVT, - TruncSrc, // Operand - DAG.getConstant(0, ExtVT), // Offset - DAG.getConstant(SrcBits, ExtVT)); // Width + // In order for this to be a signed 24-bit value, bit 23, must + // be a sign bit. + return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated + // as unsigned 24-bit values. + (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; +} + +static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { + + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = Op.getValueType(); + + APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, true, true); + if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); +} - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Ext32); +template <typename IntTy> +static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, + uint32_t Offset, uint32_t Width) { + if (Width + Offset < 32) { + IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width); + return DAG.getConstant(Result, MVT::i32); } - // For everything else, use the standard bitshift expansion. - return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG); + return DAG.getConstant(Src0 >> Offset, MVT::i32); +} + +SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch(N->getOpcode()) { + default: break; + case ISD::MUL: { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mul; + + // FIXME: Add support for 24-bit multiply with 64-bit output on SI. + if (VT.isVector() || VT.getSizeInBits() > 32) + break; + + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + } else { + break; + } + + // We need to use sext even for MUL_U24, because MUL_U24 is used + // for signed multiply of 8 and 16-bit types. 
+ SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT); + + return Reg; + } + case AMDGPUISD::MUL_I24: + case AMDGPUISD::MUL_U24: { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + simplifyI24(N0, DCI); + simplifyI24(N1, DCI); + return SDValue(); + } + case ISD::SELECT_CC: { + return CombineMinMax(N, DAG); + } + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + assert(!N->getValueType(0).isVector() && + "Vector handling of BFE not implemented"); + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); + if (!Width) + break; + + uint32_t WidthVal = Width->getZExtValue() & 0x1f; + if (WidthVal == 0) + return DAG.getConstant(0, MVT::i32); + + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Offset) + break; + + SDValue BitsFrom = N->getOperand(0); + uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; + + bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; + + if (OffsetVal == 0) { + // This is already sign / zero extended, so try to fold away extra BFEs. + unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); + + unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); + if (OpSignBits >= SignBits) + return BitsFrom; + + EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); + if (Signed) { + // This is a sign_extend_inreg. Replace it to take advantage of existing + // DAG Combines. If not eliminated, we will match back to BFE during + // selection. + + // TODO: The sext_inreg of extended types ends, although we can could + // handle them in a single BFE. + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, + DAG.getValueType(SmallVT)); + } + + return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); + } + + if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (Signed) { + return constantFoldBFE<int32_t>(DAG, + Val->getSExtValue(), + OffsetVal, + WidthVal); + } + + return constantFoldBFE<uint32_t>(DAG, + Val->getZExtValue(), + OffsetVal, + WidthVal); + } + + APInt Demanded = APInt::getBitsSet(32, + OffsetVal, + OffsetVal + WidthVal); + + if ((OffsetVal + WidthVal) >= 32) { + SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32); + return DAG.getNode(Signed ? 
ISD::SRA : ISD::SRL, DL, MVT::i32, + BitsFrom, ShiftVal); + } + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + break; + } + } + return SDValue(); } //===----------------------------------------------------------------------===// @@ -1181,7 +1507,7 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return 0; + default: return nullptr; // AMDIL DAG nodes NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); @@ -1202,6 +1528,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) + NODE_NAME_CASE(MUL_U24) + NODE_NAME_CASE(MUL_I24) + NODE_NAME_CASE(MAD_U24) + NODE_NAME_CASE(MAD_I24) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DOT4) NODE_NAME_CASE(EXPORT) @@ -1219,22 +1549,22 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { } } -static void computeMaskedBitsForMinMax(const SDValue Op0, - const SDValue Op1, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) { +static void computeKnownBitsForMinMax(const SDValue Op0, + const SDValue Op1, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) { APInt Op0Zero, Op0One; APInt Op1Zero, Op1One; - DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth); - DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth); + DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); + DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); KnownZero = Op0Zero & Op1Zero; KnownOne = Op0One & Op1One; } -void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( +void AMDGPUTargetLowering::computeKnownBitsForTargetNode( const SDValue Op, APInt &KnownZero, APInt &KnownOne, @@ -1242,8 +1572,14 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( unsigned Depth) const { KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. + + APInt KnownZero2; + APInt KnownOne2; unsigned Opc = Op.getOpcode(); + switch (Opc) { + default: + break; case ISD::INTRINSIC_WO_CHAIN: { // FIXME: The intrinsic should just use the node. 
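The constantFoldBFE helper and the BFE_I32/BFE_U32 combine above fold a bit-field extract with constant operands using two shifts: move the field to the top of the word, then shift it back down, so a signed extract comes back sign-extended for free. A self-contained sketch of that arithmetic on plain 32-bit integers (the function and test values below are illustrative, not the patch's APIs):

#include <cassert>
#include <cstdint>

// Extract Width bits starting at Offset; for a signed IntTy the right shift
// is arithmetic, so the field is sign-extended (assumes two's complement).
template <typename IntTy>
static IntTy foldBFE(IntTy Src0, uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32) {
    IntTy Hi = static_cast<IntTy>(static_cast<uint32_t>(Src0)
                                  << (32 - Offset - Width));
    return Hi >> (32 - Width);
  }
  // The field reaches bit 31, so a single right shift suffices.
  return Src0 >> Offset;
}

int main() {
  assert(foldBFE<uint32_t>(0xf0u, 4, 4) == 0xf);  // unsigned extract
  assert(foldBFE<int32_t>(0xf0, 4, 4) == -1);     // signed extract of 0b1111
  return 0;
}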
switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { @@ -1251,8 +1587,8 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( case AMDGPUIntrinsic::AMDGPU_umax: case AMDGPUIntrinsic::AMDGPU_imin: case AMDGPUIntrinsic::AMDGPU_umin: - computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2), - KnownZero, KnownOne, DAG, Depth); + computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), + KnownZero, KnownOne, DAG, Depth); break; default: break; @@ -1264,10 +1600,62 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( case AMDGPUISD::UMAX: case AMDGPUISD::SMIN: case AMDGPUISD::UMIN: - computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1), - KnownZero, KnownOne, DAG, Depth); + computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1), + KnownZero, KnownOne, DAG, Depth); break; - default: + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!CWidth) + return; + + unsigned BitWidth = 32; + uint32_t Width = CWidth->getZExtValue() & 0x1f; + if (Width == 0) { + KnownZero = APInt::getAllOnesValue(BitWidth); + KnownOne = APInt::getNullValue(BitWidth); + return; + } + + // FIXME: This could do a lot more. If offset is 0, should be the same as + // sign_extend_inreg implementation, but that involves duplicating it. + if (Opc == AMDGPUISD::BFE_I32) + KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + else + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + break; } + } +} + +unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth) const { + switch (Op.getOpcode()) { + case AMDGPUISD::BFE_I32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!Width) + return 1; + + unsigned SignBits = 32 - Width->getZExtValue() + 1; + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Offset || !Offset->isNullValue()) + return SignBits; + + // TODO: Could probably figure something out with non-0 offsets. + unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + return std::max(SignBits, Op0SignBits); + } + + case AMDGPUISD::BFE_U32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; + } + + default: + return 1; + } } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index a019616..d5d821d 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -29,9 +29,6 @@ protected: const AMDGPUSubtarget *Subtarget; private: - void ExtractVectorElements(SDValue Op, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &Args, - unsigned Start, unsigned Count) const; SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, const SDValue &InitPtr, SDValue Chain, @@ -44,7 +41,7 @@ private: /// of the same bitwidth. SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; /// \brief Split a vector store into multiple scalar stores. - /// \returns The resulting chain. + /// \returns The resulting chain. 
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; @@ -83,62 +80,67 @@ protected: public: AMDGPUTargetLowering(TargetMachine &TM); - virtual bool isFAbsFree(EVT VT) const override; - virtual bool isFNegFree(EVT VT) const override; - virtual bool isTruncateFree(EVT Src, EVT Dest) const override; - virtual bool isTruncateFree(Type *Src, Type *Dest) const override; - - virtual bool isZExtFree(Type *Src, Type *Dest) const override; - virtual bool isZExtFree(EVT Src, EVT Dest) const override; - - virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; - - virtual MVT getVectorIdxTy() const override; - virtual bool isLoadBitCastBeneficial(EVT, EVT) const override; - virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const; - virtual SDValue LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl<SDValue> &InVals) const { - CLI.Callee.dump(); - llvm_unreachable("Undefined function"); - } + bool isFAbsFree(EVT VT) const override; + bool isFNegFree(EVT VT) const override; + bool isTruncateFree(EVT Src, EVT Dest) const override; + bool isTruncateFree(Type *Src, Type *Dest) const override; + + bool isZExtFree(Type *Src, Type *Dest) const override; + bool isZExtFree(EVT Src, EVT Dest) const override; + + bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - virtual void ReplaceNodeResults(SDNode * N, - SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) const override; + MVT getVectorIdxTy() const override; + bool isLoadBitCastBeneficial(EVT, EVT) const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const override; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const; - virtual const char* getTargetNodeName(unsigned Opcode) const; + SDValue CombineMinMax(SDNode *N, SelectionDAG &DAG) const; + const char* getTargetNodeName(unsigned Opcode) const override; - virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const { + virtual SDNode *PostISelFolding(MachineSDNode *N, + SelectionDAG &DAG) const { return N; } /// \brief Determine which of the bits specified in \p Mask are known to be /// either zero or one and return them in the \p KnownZero and \p KnownOne /// bitsets. 
- virtual void computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const override; + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + virtual unsigned ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; // Functions defined in AMDILISelLowering.cpp public: - virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, unsigned Intrinsic) const; + bool getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, unsigned Intrinsic) const override; /// We want to mark f32/f64 floating point values as legal. - bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; /// We don't want to shrink f64/f32 constants. - bool ShouldShrinkFPConstant(EVT VT) const; + bool ShouldShrinkFPConstant(EVT VT) const override; + + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; private: void InitAMDILLowering(); @@ -158,7 +160,6 @@ private: SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; }; namespace AMDGPUISD { @@ -188,6 +189,10 @@ enum { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. + MUL_U24, + MUL_I24, + MAD_U24, + MAD_I24, TEXTURE_FETCH, EXPORT, CONST_ADDRESS, diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp index e32dd9f..1c3361a 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -20,14 +20,13 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +using namespace llvm; + #define GET_INSTRINFO_CTOR_DTOR #define GET_INSTRINFO_NAMED_OPS #define GET_INSTRMAP_INFO #include "AMDGPUGenInstrInfo.inc" -using namespace llvm; - - // Pin the vtable to this file. void AMDGPUInstrInfo::anchor() {} @@ -85,7 +84,7 @@ AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const { // TODO: Implement this function - return NULL; + return nullptr; } bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter, MachineBasicBlock &MBB) const { @@ -176,7 +175,7 @@ AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, const SmallVectorImpl<unsigned> &Ops, int FrameIndex) const { // TODO: Implement this function - return 0; + return nullptr; } MachineInstr* AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, @@ -184,7 +183,7 @@ AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, const SmallVectorImpl<unsigned> &Ops, MachineInstr *LoadMI) const { // TODO: Implement this function - return 0; + return nullptr; } bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, @@ -356,3 +355,14 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3); } } + +// Wrapper for Tablegen'd function. enum Subtarget is not defined in any +// header files, so we need to wrap it in a function that takes unsigned +// instead. 
+namespace llvm { +namespace AMDGPU { +int getMCOpcode(uint16_t Opcode, unsigned Gen) { + return getMCOpcode(Opcode); +} +} +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index 426910c..74baf6b 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -52,14 +52,15 @@ public: virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const; + unsigned &DstReg, unsigned &SubIdx) const override; - unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const; + int &FrameIndex) const override; bool hasLoadFromStackSlot(const MachineInstr *MI, const MachineMemOperand *&MMO, - int &FrameIndex) const; + int &FrameIndex) const override; unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, int &FrameIndex) const; @@ -70,7 +71,7 @@ public: MachineInstr * convertToThreeAddress(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const; + LiveVariables *LV) const override; virtual void copyPhysReg(MachineBasicBlock &MBB, @@ -78,61 +79,62 @@ public: unsigned DestReg, unsigned SrcReg, bool KillSrc) const = 0; - virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - virtual void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; - virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; protected: MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const; + int FrameIndex) const override; MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, - MachineInstr *LoadMI) const; + MachineInstr *LoadMI) const override; /// \returns the smallest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. - virtual int getIndirectIndexBegin(const MachineFunction &MF) const; + int getIndirectIndexBegin(const MachineFunction &MF) const; /// \returns the largest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. 
- virtual int getIndirectIndexEnd(const MachineFunction &MF) const; + int getIndirectIndexEnd(const MachineFunction &MF) const; public: bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const; + const SmallVectorImpl<unsigned> &Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, bool UnfoldStore, - SmallVectorImpl<MachineInstr *> &NewMIs) const; + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr *> &NewMIs) const override; bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl<SDNode *> &NewNodes) const; + SmallVectorImpl<SDNode *> &NewNodes) const override; unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = 0) const; + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = nullptr) const override; bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, - unsigned NumLoads) const; + unsigned NumLoads) const override; - bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; void insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const; - bool isPredicated(const MachineInstr *MI) const; + MachineBasicBlock::iterator MI) const override; + bool isPredicated(const MachineInstr *MI) const override; bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const; + const SmallVectorImpl<MachineOperand> &Pred2) const override; bool DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const; - bool isPredicable(MachineInstr *MI) const; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + std::vector<MachineOperand> &Pred) const override; + bool isPredicable(MachineInstr *MI) const override; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; // Helper functions that check the opcode for status information bool isLoadInst(llvm::MachineInstr *MI) const; @@ -186,8 +188,7 @@ public: /// \brief Convert the AMDIL MachineInstr to a supported ISA /// MachineInstr - virtual void convertToISA(MachineInstr & MI, MachineFunction &MF, - DebugLoc DL) const; + void convertToISA(MachineInstr & MI, MachineFunction &MF, DebugLoc DL) const; /// \brief Build a MOV instruction. virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 69d8059..f96dbb4 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -92,3 +92,18 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when +// performing the mulitply. The result is a 32-bit value. 
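A plain C++ model of the 24-bit multiply semantics the comment above describes, where only the low 24 bits of each operand participate and the result is a full 32-bit value (helper names invented for the example; the signed variant assumes an arithmetic right shift):

#include <cassert>
#include <cstdint>

static uint32_t mulU24(uint32_t A, uint32_t B) {
  return (A & 0xffffffu) * (B & 0xffffffu);   // low 32 bits of the product
}

static int32_t mulI24(int32_t A, int32_t B) {
  // Sign-extend the low 24 bits of each operand, multiply in 64 bits,
  // and keep the low 32 bits of the result.
  auto SExt24 = [](int32_t V) {
    return static_cast<int32_t>(static_cast<uint32_t>(V) << 8) >> 8;
  };
  return static_cast<int32_t>(int64_t(SExt24(A)) * SExt24(B));
}

int main() {
  assert(mulU24(0xff000003u, 5) == 15);  // high 8 bits of the operand ignored
  assert(mulI24(-2, 3) == -6);           // sign of the low 24 bits preserved
  return 0;
}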
+def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, + [SDNPCommutative] +>; +def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, + [SDNPCommutative] +>; + +def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, + [] +>; +def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, + [] +>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 505fc81..80bdf5b 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -37,6 +37,18 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern> def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; +def u32imm : Operand<i32> { + let PrintMethod = "printU32ImmOperand"; +} + +def u16imm : Operand<i16> { + let PrintMethod = "printU16ImmOperand"; +} + +def u8imm : Operand<i8> { + let PrintMethod = "printU8ImmOperand"; +} + //===----------------------------------------------------------------------===// // PatLeafs for floating-point comparisons //===----------------------------------------------------------------------===// @@ -253,9 +265,6 @@ def FP_ONE : PatLeaf < [{return N->isExactlyValue(1.0);}] >; -def U24 : ComplexPattern<i32, 1, "SelectU24", [], []>; -def I24 : ComplexPattern<i32, 1, "SelectI24", [], []>; - let isCodeGenOnly = 1, isPseudo = 1 in { let usesCustomInserter = 1 in { @@ -414,6 +423,40 @@ class UMUL24Pattern <Instruction UMUL24> : Pat < >; */ +class IMad24Pat<Instruction Inst> : Pat < + (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +class UMad24Pat<Instruction Inst> : Pat < + (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +multiclass Expand24IBitOps<Instruction MulInst, Instruction AddInst> { + def _expand_imad24 : Pat < + (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_imul24 : Pat < + (AMDGPUmul_i24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> { + def _expand_umad24 : Pat < + (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_umul24 : Pat < + (AMDGPUmul_u24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + include "R600Instructions.td" include "R700Instructions.td" include "EvergreenInstructions.td" diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td index c6521d0..9ad5e72 100644 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -49,6 +49,10 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_bfi : 
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp index 2c9909f..b759495 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ b/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -17,6 +17,7 @@ #include "AMDGPUAsmPrinter.h" #include "InstPrinter/AMDGPUInstPrinter.h" #include "R600InstrInfo.h" +#include "SIInstrInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/Constants.h" @@ -31,16 +32,30 @@ using namespace llvm; -AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx): - Ctx(ctx) +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): + Ctx(ctx), ST(st) { } +enum AMDGPUMCInstLower::SISubtarget +AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned) const { + return AMDGPUMCInstLower::SI; +} + +unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const { + + int MCOpcode = AMDGPU::getMCOpcode(MIOpcode, + AMDGPUSubtargetToSISubtarget(ST.getGeneration())); + if (MCOpcode == -1) + MCOpcode = MIOpcode; + + return MCOpcode; +} + void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { - OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + OutMI.setOpcode(getMCOpcode(MI->getOpcode())); + for (const MachineOperand &MO : MI->explicit_operands()) { MCOperand MCOp; switch (MO.getType()) { default: @@ -67,7 +82,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { } void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { - AMDGPUMCInstLower MCInstLowering(OutContext); + AMDGPUMCInstLower MCInstLowering(OutContext, + MF->getTarget().getSubtarget<AMDGPUSubtarget>()); #ifdef _DEBUG StringRef Err; diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h index d7d538e..2b7f1e3 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.h +++ b/lib/Target/R600/AMDGPUMCInstLower.h @@ -13,16 +13,30 @@ namespace llvm { +class AMDGPUSubtarget; class MCInst; class MCContext; class MachineInstr; class AMDGPUMCInstLower { + // This must be kept in sync with the SISubtarget class in SIInstrInfo.td + enum SISubtarget { + SI = 0 + }; + MCContext &Ctx; + const AMDGPUSubtarget &ST; + + /// Convert a member of the AMDGPUSubtarget::Generation enum to the + /// SISubtarget enum. + enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) const; + + /// Get the MC opcode for this MachineInstr. + unsigned getMCOpcode(unsigned MIOpcode) const; public: - AMDGPUMCInstLower(MCContext &ctx); + AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); /// \brief Lower a MachineInstr to an MCInst void lower(const MachineInstr *MI, MCInst &OutMI) const; diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp index 8fbec4e..19927fa 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -27,10 +27,10 @@ AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm) // they are not supported at this time. 
//===----------------------------------------------------------------------===// -const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; +const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; -const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) - const { +const MCPhysReg* +AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return &CalleeSavedReg; } @@ -54,7 +54,7 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { AMDGPU::sub15 }; - assert (Channel < array_lengthof(SubRegs)); + assert(Channel < array_lengthof(SubRegs)); return SubRegs[Channel]; } diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index 688e1a0..a7cba0d 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -30,11 +30,11 @@ class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { TargetMachine &TM; - static const uint16_t CalleeSavedReg; + static const MCPhysReg CalleeSavedReg; AMDGPURegisterInfo(TargetMachine &tm); - virtual BitVector getReservedRegs(const MachineFunction &MF) const { + BitVector getReservedRegs(const MachineFunction &MF) const override { assert(!"Unimplemented"); return BitVector(); } @@ -43,11 +43,11 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { /// \returns The ISA reg class that is equivalent to \p RC. virtual const TargetRegisterClass * getISARegClass( const TargetRegisterClass * RC) const { - assert(!"Unimplemented"); return NULL; + assert(!"Unimplemented"); return nullptr; } virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { - assert(!"Unimplemented"); return NULL; + assert(!"Unimplemented"); return nullptr; } virtual unsigned getHWRegIndex(unsigned Reg) const { @@ -58,11 +58,11 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) unsigned getSubRegFromChannel(unsigned Channel) const; - const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const; + const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const; - unsigned getFrameRegister(const MachineFunction &MF) const; + RegScavenger *RS) const override; + unsigned getFrameRegister(const MachineFunction &MF) const override; unsigned getIndirectSubReg(unsigned IndirectIndex) const; diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index e77ab5e..f3b9932 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -16,6 +16,8 @@ using namespace llvm; +#define DEBUG_TYPE "amdgpu-subtarget" + #define GET_SUBTARGETINFO_ENUM #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR @@ -28,9 +30,6 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : // Default card StringRef GPU = CPU; Is64bit = false; - DefaultSize[0] = 64; - DefaultSize[1] = 1; - DefaultSize[2] = 1; HasVertexCache = false; TexVTXClauseSize = 0; Gen = AMDGPUSubtarget::R600; @@ -106,14 +105,6 @@ bool AMDGPUSubtarget::isTargetELF() const { return false; } -size_t -AMDGPUSubtarget::getDefaultSize(uint32_t dim) const { - if (dim > 2) { - return 1; - } else { - return DefaultSize[dim]; - } -} std::string AMDGPUSubtarget::getDeviceName() const { diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 8874d14..1b041d6 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -38,7 +38,6 @@ public: }; private: - size_t DefaultSize[3]; std::string DevName; bool Is64bit; bool Is32on64bit; @@ -60,7 +59,7 @@ public: AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } - virtual void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); bool is64bit() const; bool hasVertexCache() const; @@ -77,20 +76,28 @@ public: return hasBFE(); } + bool hasMulU24() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasMulI24() const { + return (getGeneration() >= SOUTHERN_ISLANDS || + hasCaymanISA()); + } + bool IsIRStructurizerEnabled() const; bool isIfCvtEnabled() const; unsigned getWavefrontSize() const; unsigned getStackEntrySize() const; bool hasCFAluBug() const; - virtual bool enableMachineScheduler() const { + bool enableMachineScheduler() const override { return getGeneration() <= NORTHERN_ISLANDS; } // Helper functions to simplify if statements bool isTargetELF() const; std::string getDeviceName() const; - virtual size_t getDefaultSize(uint32_t dim) const; bool dumpCode() const { return DumpCode; } bool r600ALUEncoding() const { return R600ALUInst; } diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index b11fce3..174fdca 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -42,7 +42,7 @@ extern "C" void LLVMInitializeR600Target() { } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, new R600SchedStrategy()); + return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>()); } static MachineSchedRegistry @@ -54,7 +54,7 @@ static std::string 
computeDataLayout(const AMDGPUSubtarget &ST) { if (ST.is64bit()) { // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64"; + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; } Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" @@ -103,20 +103,20 @@ public: return getTM<AMDGPUTargetMachine>(); } - virtual ScheduleDAGInstrs * - createMachineScheduler(MachineSchedContext *C) const { + ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) return createR600MachineScheduler(C); - return 0; + return nullptr; } - virtual bool addPreISel(); - virtual bool addInstSelector(); - virtual bool addPreRegAlloc(); - virtual bool addPostRegAlloc(); - virtual bool addPreSched2(); - virtual bool addPreEmitPass(); + bool addPreISel() override; + bool addInstSelector() override; + bool addPreRegAlloc() override; + bool addPostRegAlloc() override; + bool addPreSched2() override; + bool addPreEmitPass() override; }; } // End of anonymous namespace @@ -154,6 +154,7 @@ AMDGPUPassConfig::addPreISel() { bool AMDGPUPassConfig::addInstSelector() { addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + addPass(createSILowerI1CopiesPass()); return false; } diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index f942614..1287e13 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -20,7 +20,6 @@ #include "AMDGPUSubtarget.h" #include "AMDILIntrinsicInfo.h" #include "R600ISelLowering.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/IR/DataLayout.h" namespace llvm { @@ -31,8 +30,8 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { const DataLayout Layout; AMDGPUFrameLowering FrameLowering; AMDGPUIntrinsicInfo IntrinsicInfo; - OwningPtr<AMDGPUInstrInfo> InstrInfo; - OwningPtr<AMDGPUTargetLowering> TLInfo; + std::unique_ptr<AMDGPUInstrInfo> InstrInfo; + std::unique_ptr<AMDGPUTargetLowering> TLInfo; const InstrItineraryData *InstrItins; public: @@ -40,30 +39,32 @@ public: StringRef CPU, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~AMDGPUTargetMachine(); - virtual const AMDGPUFrameLowering *getFrameLowering() const { + const AMDGPUFrameLowering *getFrameLowering() const override { return &FrameLowering; } - virtual const AMDGPUIntrinsicInfo *getIntrinsicInfo() const { + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { return &IntrinsicInfo; } - virtual const AMDGPUInstrInfo *getInstrInfo() const { + const AMDGPUInstrInfo *getInstrInfo() const override { return InstrInfo.get(); } - virtual const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } - virtual const AMDGPURegisterInfo *getRegisterInfo() const { + const AMDGPUSubtarget *getSubtargetImpl() const override { + return &Subtarget; + } + const AMDGPURegisterInfo *getRegisterInfo() const override { return &InstrInfo->getRegisterInfo(); } - virtual AMDGPUTargetLowering *getTargetLowering() const { + AMDGPUTargetLowering *getTargetLowering() const override { return TLInfo.get(); } - virtual const InstrItineraryData *getInstrItineraryData() const { + const InstrItineraryData *getInstrItineraryData() const override { return InstrItins; } - virtual const DataLayout *getDataLayout() const { return &Layout; } - virtual TargetPassConfig 
*createPassConfig(PassManagerBase &PM); + const DataLayout *getDataLayout() const override { return &Layout; } + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; /// \brief Register R600 analysis passes with a pass manager. - virtual void addAnalysisPasses(PassManagerBase &PM); + void addAnalysisPasses(PassManagerBase &PM) override; }; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp index 51225eb..ea78f43 100644 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "AMDGPUtti" #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" #include "llvm/Analysis/LoopInfo.h" @@ -26,6 +25,8 @@ #include "llvm/Target/TargetLowering.h" using namespace llvm; +#define DEBUG_TYPE "AMDGPUtti" + // Declare the pass initialization routine locally as target-specific passes // don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. @@ -45,7 +46,7 @@ class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo { unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - AMDGPUTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { llvm_unreachable("This pass cannot be directly constructed"); } @@ -55,9 +56,9 @@ public: initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry()); } - virtual void initializePass() override { pushTTIStack(this); } + void initializePass() override { pushTTIStack(this); } - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const override { TargetTransformInfo::getAnalysisUsage(AU); } @@ -65,15 +66,16 @@ public: static char ID; /// Provide necessary pointer adjustments for the two base classes. - virtual void *getAdjustedAnalysisPointer(const void *ID) override { + void *getAdjustedAnalysisPointer(const void *ID) override { if (ID == &TargetTransformInfo::ID) return (TargetTransformInfo *)this; return this; } - virtual bool hasBranchDivergence() const override; + bool hasBranchDivergence() const override; - virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const; + void getUnrollingPreferences(Loop *L, + UnrollingPreferences &UP) const override; /// @} }; @@ -109,11 +111,11 @@ void AMDGPUTTI::getUnrollingPreferences(Loop *L, // require us to use indirect addressing, which is slow and prone to // compiler bugs. If this loop does an address calculation on an // alloca ptr, then we want to use a higher than normal loop unroll - // threshold. This will give SROA a better chance to eliminate these - // allocas. - // - // Don't use the maximum allowed value here as it will make some - // programs way too big. + // threshold. This will give SROA a better chance to eliminate these + // allocas. + // + // Don't use the maximum allowed value here as it will make some + // programs way too big. 
UP.Threshold = 500; } } diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp index 21ca560..f3a0391 100644 --- a/lib/Target/R600/AMDILCFGStructurizer.cpp +++ b/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -8,8 +8,6 @@ /// \file //==-----------------------------------------------------------------------===// -#define DEBUG_TYPE "structcfg" - #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "R600InstrInfo.h" @@ -34,6 +32,8 @@ using namespace llvm; +#define DEBUG_TYPE "structcfg" + #define DEFAULT_VEC_SLOTS 8 // TODO: move-begin. @@ -135,15 +135,15 @@ public: static char ID; AMDGPUCFGStructurizer() : - MachineFunctionPass(ID), TII(NULL), TRI(NULL) { + MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) { initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); } - const char *getPassName() const { + const char *getPassName() const override { return "AMDGPU Control Flow Graph structurizer Pass"; } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved<MachineFunctionAnalysis>(); AU.addRequired<MachineFunctionAnalysis>(); AU.addRequired<MachineDominatorTree>(); @@ -159,7 +159,7 @@ public: /// sure all loops have an exit block bool prepare(); - bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); TRI = &TII->getRegisterInfo(); DEBUG(MF.dump();); @@ -168,7 +168,7 @@ public: MLI = &getAnalysis<MachineLoopInfo>(); DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); MDT = &getAnalysis<MachineDominatorTree>(); - DEBUG(MDT->print(dbgs(), (const llvm::Module*)0);); + DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr);); PDT = &getAnalysis<MachinePostDominatorTree>(); DEBUG(PDT->print(dbgs());); prepare(); @@ -334,7 +334,7 @@ protected: MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); void recordSccnum(MachineBasicBlock *MBB, int SCCNum); void retireBlock(MachineBasicBlock *MBB); - void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = NULL); + void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&); /// This is work around solution for findNearestCommonDominator not avaiable @@ -361,7 +361,7 @@ MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) const { LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); if (It == LLInfoMap.end()) - return NULL; + return nullptr; return (*It).second; } @@ -632,7 +632,7 @@ MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( MachineInstr *MI = &*It; if (MI && (isCondBranch(MI) || isUncondBranch(MI))) return MI; - return NULL; + return nullptr; } MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( @@ -648,7 +648,7 @@ MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( break; } } - return NULL; + return nullptr; } MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { @@ -658,7 +658,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { if (instr->getOpcode() == AMDGPU::RETURN) return instr; } - return NULL; + return nullptr; } MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { @@ -668,7 +668,7 @@ MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { if (MI->getOpcode() == AMDGPU::CONTINUE) return MI; } - 
return NULL; + return nullptr; } bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { @@ -819,7 +819,7 @@ bool AMDGPUCFGStructurizer::run() { SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter = It; - MachineBasicBlock *SccBeginMBB = NULL; + MachineBasicBlock *SccBeginMBB = nullptr; int SccNumBlk = 0; // The number of active blocks, init to a // maximum possible number. int SccNumIter; // Number of iteration in this SCC. @@ -874,7 +874,7 @@ bool AMDGPUCFGStructurizer::run() { } if (ContNextScc) - SccBeginMBB = NULL; + SccBeginMBB = nullptr; } //while, "one iteration" over the function. MachineBasicBlock *EntryMBB = @@ -933,7 +933,7 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { MachineBasicBlock *MBB; for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd(); ++It, ++SccNum) { - std::vector<MachineBasicBlock *> &SccNext = *It; + const std::vector<MachineBasicBlock *> &SccNext = *It; for (std::vector<MachineBasicBlock *>::const_iterator blockIter = SccNext.begin(), blockEnd = SccNext.end(); blockIter != blockEnd; ++blockIter) { @@ -1026,7 +1026,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { // Triangle pattern, false is empty LandBlk = FalseMBB; - FalseMBB = NULL; + FalseMBB = nullptr; } else if (FalseMBB->succ_size() == 1 && *FalseMBB->succ_begin() == TrueMBB) { // Triangle pattern, true is empty @@ -1034,7 +1034,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { std::swap(TrueMBB, FalseMBB); reversePredicateSetter(MBB->end()); LandBlk = FalseMBB; - FalseMBB = NULL; + FalseMBB = nullptr; } else if (FalseMBB->succ_size() == 1 && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { LandBlk = *FalseMBB->succ_begin(); @@ -1075,13 +1075,11 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { int AMDGPUCFGStructurizer::loopendPatternMatch() { std::vector<MachineLoop *> NestedLoops; - for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); - It != E; ++It) { - df_iterator<MachineLoop *> LpIt = df_begin(*It), - LpE = df_end(*It); - for (; LpIt != LpE; ++LpIt) - NestedLoops.push_back(*LpIt); - } + for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); It != E; + ++It) + for (MachineLoop *ML : depth_first(*It)) + NestedLoops.push_back(ML); + if (NestedLoops.size() == 0) return 0; @@ -1244,7 +1242,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, DEBUG( dbgs() << " not working\n"; ); - DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : NULL; + DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr; } // walk down the postDomTree return Num; @@ -1723,11 +1721,11 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); if (!LoopHeader || !LoopLatch) - return NULL; + return nullptr; MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch); // Is LoopRep an infinite loop ? 
if (!BranchMI || !isUncondBranch(BranchMI)) - return NULL; + return nullptr; MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); FuncRep->push_back(DummyExitBlk); //insert to function @@ -1860,7 +1858,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); if (!Node1 || !Node2) - return NULL; + return nullptr; Node1 = Node1->getIDom(); while (Node1) { @@ -1869,7 +1867,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, Node1 = Node1->getIDom(); } - return NULL; + return nullptr; } MachineBasicBlock * diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index 0761ff4..7cea803 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -39,61 +39,55 @@ using namespace llvm; // TargetLowering Class Implementation Begins //===----------------------------------------------------------------------===// void AMDGPUTargetLowering::InitAMDILLowering() { - static const int types[] = { - (int)MVT::i8, - (int)MVT::i16, - (int)MVT::i32, - (int)MVT::f32, - (int)MVT::f64, - (int)MVT::i64, - (int)MVT::v2i8, - (int)MVT::v4i8, - (int)MVT::v2i16, - (int)MVT::v4i16, - (int)MVT::v4f32, - (int)MVT::v4i32, - (int)MVT::v2f32, - (int)MVT::v2i32, - (int)MVT::v2f64, - (int)MVT::v2i64 + static const MVT::SimpleValueType types[] = { + MVT::i8, + MVT::i16, + MVT::i32, + MVT::f32, + MVT::f64, + MVT::i64, + MVT::v2i8, + MVT::v4i8, + MVT::v2i16, + MVT::v4i16, + MVT::v4f32, + MVT::v4i32, + MVT::v2f32, + MVT::v2i32, + MVT::v2f64, + MVT::v2i64 }; - static const int IntTypes[] = { - (int)MVT::i8, - (int)MVT::i16, - (int)MVT::i32, - (int)MVT::i64 + static const MVT::SimpleValueType IntTypes[] = { + MVT::i8, + MVT::i16, + MVT::i32, + MVT::i64 }; - static const int FloatTypes[] = { - (int)MVT::f32, - (int)MVT::f64 + static const MVT::SimpleValueType FloatTypes[] = { + MVT::f32, + MVT::f64 }; - static const int VectorTypes[] = { - (int)MVT::v2i8, - (int)MVT::v4i8, - (int)MVT::v2i16, - (int)MVT::v4i16, - (int)MVT::v4f32, - (int)MVT::v4i32, - (int)MVT::v2f32, - (int)MVT::v2i32, - (int)MVT::v2f64, - (int)MVT::v2i64 + static const MVT::SimpleValueType VectorTypes[] = { + MVT::v2i8, + MVT::v4i8, + MVT::v2i16, + MVT::v4i16, + MVT::v4f32, + MVT::v4i32, + MVT::v2f32, + MVT::v2i32, + MVT::v2f64, + MVT::v2i64 }; - const size_t NumTypes = array_lengthof(types); - const size_t NumFloatTypes = array_lengthof(FloatTypes); - const size_t NumIntTypes = array_lengthof(IntTypes); - const size_t NumVectorTypes = array_lengthof(VectorTypes); const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>(); // These are the current register classes that are // supported - for (unsigned int x = 0; x < NumTypes; ++x) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x]; - + for (MVT VT : types) { setOperationAction(ISD::SUBE, VT, Expand); setOperationAction(ISD::SUBC, VT, Expand); setOperationAction(ISD::ADDE, VT, Expand); @@ -109,9 +103,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::SDIV, VT, Custom); } } - for (unsigned int x = 0; x < NumFloatTypes; ++x) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x]; - + for (MVT VT : FloatTypes) { // IL does not have these operations for floating point types setOperationAction(ISD::FP_ROUND_INREG, VT, Expand); setOperationAction(ISD::SETOLT, VT, Expand); @@ -124,9 +116,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() { 
setOperationAction(ISD::SETULE, VT, Expand); } - for (unsigned int x = 0; x < NumIntTypes; ++x) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x]; - + for (MVT VT : IntTypes) { // GPU also does not have divrem function for signed or unsigned setOperationAction(ISD::SDIVREM, VT, Expand); @@ -142,9 +132,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::CTLZ, VT, Expand); } - for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii]; - + for (MVT VT : VectorTypes) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDILIntrinsicInfo.cpp index 762ee39..fab4a3b 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.cpp +++ b/lib/Target/R600/AMDILIntrinsicInfo.cpp @@ -38,7 +38,7 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, }; if (IntrID < Intrinsic::num_intrinsics) { - return 0; + return nullptr; } assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDILIntrinsicInfo.h index 35559e2..924275a 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.h +++ b/lib/Target/R600/AMDILIntrinsicInfo.h @@ -34,13 +34,13 @@ enum ID { class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(TargetMachine *tm); - std::string getName(unsigned int IntrId, Type **Tys = 0, - unsigned int numTys = 0) const; - unsigned int lookupName(const char *Name, unsigned int Len) const; - bool isOverloaded(unsigned int IID) const; + std::string getName(unsigned int IntrId, Type **Tys = nullptr, + unsigned int numTys = 0) const override; + unsigned int lookupName(const char *Name, unsigned int Len) const override; + bool isOverloaded(unsigned int IID) const override; Function *getDeclaration(Module *M, unsigned int ID, - Type **Tys = 0, - unsigned int numTys = 0) const; + Type **Tys = nullptr, + unsigned int numTys = 0) const override; }; } // end namespace llvm diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td index 658deb5..4a3e02e 100644 --- a/lib/Target/R600/AMDILIntrinsics.td +++ b/lib/Target/R600/AMDILIntrinsics.td @@ -92,10 +92,6 @@ let TargetPrefix = "AMDIL", isTarget = 1 in { BinaryIntInt; def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, BinaryIntInt; - def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">, - BinaryIntInt; - def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">, - BinaryIntInt; def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">, BinaryIntInt; def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 93a5117..3c6fa5a 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -45,6 +45,7 @@ add_llvm_target(R600CodeGen SIInstrInfo.cpp SIISelLowering.cpp SILowerControlFlow.cpp + SILowerI1Copies.cpp SIMachineFunctionInfo.cpp SIRegisterInfo.cpp SITypeRewriter.cpp diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td index acd7bde..2630345 100644 --- a/lib/Target/R600/CaymanInstructions.td +++ b/lib/Target/R600/CaymanInstructions.td @@ -21,12 +21,14 @@ def isCayman : Predicate<"Subtarget.hasCaymanISA()">; let Predicates = [isCayman] in { def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", - [(set i32:$dst, 
(add (mul I24:$src0, I24:$src1), i32:$src2))], VecALU + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU >; def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", - [(set i32:$dst, (mul I24:$src0, I24:$src1))], VecALU + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU >; +def : IMad24Pat<MULADD_INT24_cm>; + let isVector = 1 in { def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; @@ -47,6 +49,7 @@ def COS_cm : COS_Common<0x8E>; def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; +defm : Expand24UBitOps<MULLO_UINT_cm, ADD_INT>; // RECIP_UINT emulation for Cayman // The multiplication scales from [0,1] to the unsigned integer range diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td index 6430ca6..2065441 100644 --- a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -75,6 +75,8 @@ def COS_eg : COS_Common<0x8E>; def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; +defm : Expand24IBitOps<MULLO_INT_eg, ADD_INT>; + //===----------------------------------------------------------------------===// // Memory read/write instructions //===----------------------------------------------------------------------===// @@ -273,7 +275,7 @@ def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", VecALU >; -def BFE_INT_eg : R600_3OP <0x4, "BFE_INT", +def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))], VecALU >; @@ -286,6 +288,13 @@ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", VecALU >; +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>; +def : Pat<(i32 (sext_inreg i32:$src, i8)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>; +def : Pat<(i32 (sext_inreg i32:$src, i16)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; + defm : BFIPatterns <BFI_INT_eg>; def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", @@ -294,8 +303,11 @@ def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", >; def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", - [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))], VecALU + [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU >; + +def : UMad24Pat<MULADD_UINT24_eg>; + def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; def : ROTRPattern <BIT_ALIGN_INT_eg>; def MULADD_eg : MULADD_Common<0x14>; @@ -309,7 +321,7 @@ def CNDGE_eg : CNDGE_Common<0x1B>; def MUL_LIT_eg : MUL_LIT_Common<0x1F>; def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", - [(set i32:$dst, (mul U24:$src0, U24:$src1))], VecALU + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU >; def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 7105879..11ae091 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -12,6 +12,8 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; @@ -23,6 +25,21 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, printAnnotation(OS, Annot); } +void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { 
+ O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); +} + void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { switch (reg) { case AMDGPU::VCC: @@ -41,43 +58,78 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { break; } - // It's seems there's no way to use SIRegisterInfo here, and dealing with the - // giant enum of all the different shifted sets of registers is pretty - // unmanagable, so parse the name and reformat it to be prettier. - StringRef Name(getRegisterName(reg)); - - std::pair<StringRef, StringRef> Split = Name.split('_'); - StringRef SubRegName = Split.first; - StringRef Rest = Split.second; + char Type; + unsigned NumRegs; + + if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 3; + } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 16; + } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 16; + } else { + O << getRegisterName(reg); + return; + } - if (SubRegName.size() <= 4) { // Must at least be as long as "SGPR"/"VGPR". - O << Name; + // The low 8 bits encoding value is the register index, for both VGPRs and + // SGPRs. 
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + if (NumRegs == 1) { + O << Type << RegIdx; return; } - unsigned RegIndex; - StringRef RegIndexStr = SubRegName.drop_front(4); + O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; +} - if (RegIndexStr.getAsInteger(10, RegIndex)) { - O << Name; +void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) { + int32_t SImm = static_cast<int32_t>(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; return; } - if (SubRegName.front() == 'V') - O << 'v'; - else if (SubRegName.front() == 'S') - O << 's'; - else { - O << Name; + if (Imm == FloatToBits(1.0f) || + Imm == FloatToBits(-1.0f) || + Imm == FloatToBits(0.5f) || + Imm == FloatToBits(-0.5f) || + Imm == FloatToBits(2.0f) || + Imm == FloatToBits(-2.0f) || + Imm == FloatToBits(4.0f) || + Imm == FloatToBits(-4.0f)) { + O << BitsToFloat(Imm); return; } - if (Rest.empty()) // Only 1 32-bit register - O << RegIndex; - else { - unsigned NumReg = Rest.count('_') + 2; - O << '[' << RegIndex << ':' << (RegIndex + NumReg - 1) << ']'; - } + O << formatHex(static_cast<uint64_t>(Imm)); } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, @@ -95,7 +147,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, break; } } else if (Op.isImm()) { - O << Op.getImm(); + printImmediate(Op.getImm(), O); } else if (Op.isFPImm()) { O << Op.getFPImm(); } else if (Op.isExpr()) { @@ -106,6 +158,18 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); + if (InputModifiers & 0x1) + O << "-"; + if (InputModifiers & 0x2) + O << "|"; + printOperand(MI, OpNo + 1, O); + if (InputModifiers & 0x2) + O << "|"; +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNum).getImm(); diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 1d24680..6ca7170 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -29,13 +29,18 @@ public: void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; private: - static void printRegOperand(unsigned RegNo, raw_ostream &O); - static void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRegOperand(unsigned RegNo, raw_ostream &O); + void printImmediate(uint32_t Imm, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); - static void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm, StringRef Default = ""); static void printAbs(const MCInst *MI, 
unsigned OpNo, raw_ostream &O); diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp index a6bb59f..489cec7 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -23,8 +23,8 @@ namespace { class AMDGPUMCObjectWriter : public MCObjectWriter { public: AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { } - virtual void ExecutePostLayoutBinding(MCAssembler &Asm, - const MCAsmLayout &Layout) { + void ExecutePostLayoutBinding(MCAssembler &Asm, + const MCAsmLayout &Layout) override { //XXX: Implement if necessary. } void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, @@ -34,7 +34,7 @@ public: assert(!"Not implemented"); } - virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout); + void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; }; @@ -43,19 +43,19 @@ public: AMDGPUAsmBackend(const Target &T) : MCAsmBackend() {} - virtual unsigned getNumFixupKinds() const { return 0; }; - virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const; - virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { + unsigned getNumFixupKinds() const override { return 0; }; + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { return false; } - virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const { + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { assert(!"Not implemented"); } - virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; } - virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const { + bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override { return true; } }; @@ -88,7 +88,7 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { public: ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } - MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { return createAMDGPUELFObjectWriter(OS); } }; diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp index aee9bd1..78bbe0a 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -35,7 +35,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() { Data16bitsDirective = ".short\t"; Data32bitsDirective = ".long\t"; Data64bitsDirective = ".quad\t"; - GPRel32Directive = 0; + GPRel32Directive = nullptr; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; @@ -58,5 +58,5 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() { const MCSection* AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const { - return 0; + return nullptr; } diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h index 22afd63..59aebec 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -22,7 +22,7 @@ class StringRef; class 
AMDGPUMCAsmInfo : public MCAsmInfo { public: explicit AMDGPUMCAsmInfo(StringRef &TT); - const MCSection* getNonexecutableStackSection(MCContext &CTX) const; + const MCSection* getNonexecutableStackSection(MCContext &CTX) const override; }; } // namespace llvm #endif // AMDGPUMCASMINFO_H diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 6592b0e..38a2956 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -24,6 +24,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_MC_DESC #include "AMDGPUGenInstrInfo.inc" @@ -33,8 +35,6 @@ #define GET_REGINFO_MC_DESC #include "AMDGPUGenRegisterInfo.inc" -using namespace llvm; - static MCInstrInfo *createAMDGPUMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitAMDGPUMCInstrInfo(X); diff --git a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt index b1beab0..74b8ca0 100644 --- a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; +;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt -------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -19,5 +19,5 @@ type = Library name = R600Desc parent = R600 -required_libraries = R600AsmPrinter R600Info MC +required_libraries = MC R600AsmPrinter R600Info Support add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 286c7d1..5e7cefe 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -41,14 +41,14 @@ public: : MCII(mcii), MRI(mri) { } /// \brief Encode the instruction and write it to the OS. - virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI) const override; /// \returns the encoding for an MCOperand. - virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; private: void EmitByte(unsigned int byte, raw_ostream &OS) const; diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index f42e978..ee02111 100644 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -54,14 +54,14 @@ public: ~SIMCCodeEmitter() { } /// \brief Encode the instruction and write it to the OS. - virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI) const override; /// \returns the encoding for an MCOperand. 
- virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; }; } // End anonymous namespace diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index fde4481..ce17d7c 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -106,3 +106,5 @@ def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>; def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>; def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>; + +def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>; diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp index 3d9015c..92bf0df 100644 --- a/lib/Target/R600/R600ClauseMergePass.cpp +++ b/lib/Target/R600/R600ClauseMergePass.cpp @@ -13,7 +13,6 @@ /// It needs to be called after IfCvt for best results. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "r600mergeclause" #include "AMDGPU.h" #include "R600Defines.h" #include "R600InstrInfo.h" @@ -27,6 +26,8 @@ using namespace llvm; +#define DEBUG_TYPE "r600mergeclause" + namespace { static bool isCFAlu(const MachineInstr *MI) { @@ -62,9 +63,9 @@ private: public: R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const; + const char *getPassName() const override; }; char R600ClauseMergePass::ID = 0; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index f74bef3..d255e96 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -12,7 +12,6 @@ /// computing their address on the fly ; it also sets STACK_SIZE info. 
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "r600cf" #include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "R600Defines.h" @@ -26,6 +25,8 @@ using namespace llvm; +#define DEBUG_TYPE "r600cf" + namespace { struct CFStack { @@ -468,13 +469,13 @@ private: public: R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), - TII (0), TRI(0), + TII (nullptr), TRI(nullptr), ST(tm.getSubtarget<AMDGPUSubtarget>()) { const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>(); MaxFetchInst = ST.getTexVTXClauseSize(); } - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo()); R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); @@ -501,13 +502,13 @@ public: DEBUG(dbgs() << CfCount << ":"; I->dump();); FetchClauses.push_back(MakeFetchClause(MBB, I)); CfCount++; - LastAlu.back() = 0; + LastAlu.back() = nullptr; continue; } MachineBasicBlock::iterator MI = I; if (MI->getOpcode() != AMDGPU::ENDIF) - LastAlu.back() = 0; + LastAlu.back() = nullptr; if (MI->getOpcode() == AMDGPU::CF_ALU) LastAlu.back() = MI; I++; @@ -558,7 +559,7 @@ public: break; } case AMDGPU::IF_PREDICATE_SET: { - LastAlu.push_back(0); + LastAlu.push_back(nullptr); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP)) .addImm(0) @@ -665,7 +666,7 @@ public: return false; } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Control Flow Finalizer Pass"; } }; diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index 5bd793a..38afebe 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -291,12 +291,12 @@ private: public: static char ID; - R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(0), Address(0) { + R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); @@ -315,7 +315,7 @@ public: return false; } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Emit Clause Markers Pass"; } }; diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index ca1189d..732b06d 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -38,11 +38,11 @@ private: public: R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), - TII(0) { } + TII(nullptr) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "R600 Expand special instructions pass"; } }; diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 6405a82..d6c6830 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -82,9 +82,31 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : 
setOperationAction(ISD::SELECT, MVT::i32, Expand); setOperationAction(ISD::SELECT, MVT::f32, Expand); setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v2f32, Expand); setOperationAction(ISD::SELECT, MVT::v4i32, Expand); - setOperationAction(ISD::SELECT, MVT::v4f32, Expand); + + // Expand sign extension of vectors + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); + // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, MVT::i32, Custom); @@ -117,6 +139,11 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + // These should be replaced by UDVIREM, but it does not happen automatically + // during Type Legalization + setOperationAction(ISD::UDIV, MVT::i64, Custom); + setOperationAction(ISD::UREM, MVT::i64, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setBooleanContents(ZeroOrNegativeOneBooleanContent); @@ -538,8 +565,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(2, MVT::i32), // SWZ_Z DAG.getConstant(3, MVT::i32) // SWZ_W }; - return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), - Args, 8); + return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args); } // default for switch(IntrinsicID) @@ -689,7 +715,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const Op.getOperand(9), Op.getOperand(10) }; - return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19); + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); } case AMDGPUIntrinsic::AMDGPU_dp4: { SDValue Args[8] = { @@ -710,7 +736,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), DAG.getConstant(3, MVT::i32)) }; - return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8); + return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); } case Intrinsic::r600_read_ngroups_x: @@ -960,13 +986,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); } - - // Possible Min/Max pattern - SDValue MinMax = LowerMinMax(Op, DAG); - if (MinMax.getNode()) { - return MinMax; - } - // If we make it this for it means we have no native instructions to handle // this SELECT_CC, so we must lower it. 
SDValue HWTrue, HWFalse; @@ -1088,10 +1107,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(0, MVT::i32), Mask }; - SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4); + SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); SDValue Args[3] = { Chain, Input, DWordAddr }; return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, - Op->getVTList(), Args, 3, MemVT, + Op->getVTList(), Args, MemVT, StoreNode->getMemOperand()); } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && Value.getValueType().bitsGE(MVT::i32)) { @@ -1131,7 +1150,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (ValueVT.isVector()) { unsigned NumElemVT = ValueVT.getVectorNumElements(); EVT ElemVT = ValueVT.getVectorElementType(); - SDValue Stores[4]; + SmallVector<SDValue, 4> Stores(NumElemVT); assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " "vector width in load"); @@ -1148,7 +1167,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { Chain, Elem, Ptr, DAG.getTargetConstant(Channel, MVT::i32)); } - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); } else { if (ValueVT == MVT::i8) { Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); @@ -1212,10 +1231,11 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); if (Ret.getNode()) { - SDValue Ops[2]; - Ops[0] = Ret; - Ops[1] = Chain; - return DAG.getMergeValues(Ops, 2, DL); + SDValue Ops[2] = { + Ret, + Chain + }; + return DAG.getMergeValues(Ops, DL); } @@ -1224,7 +1244,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const SplitVectorLoad(Op, DAG), Chain }; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); @@ -1232,8 +1252,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { SDValue Result; - if (isa<ConstantExpr>(LoadNode->getSrcValue()) || - isa<Constant>(LoadNode->getSrcValue()) || + if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) || + isa<Constant>(LoadNode->getMemOperand()->getValue()) || isa<ConstantSDNode>(Ptr)) { SDValue Slots[4]; for (unsigned i = 0; i < 4; i++) { @@ -1252,7 +1272,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const NewVT = VT; NumElements = VT.getVectorNumElements(); } - Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements); + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, + makeArrayRef(Slots, NumElements)); } else { // non-constant ptr can't be folded, keeps it as a v4f32 load Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, @@ -1268,10 +1289,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const } SDValue MergedValues[2] = { - Result, - Chain + Result, + Chain }; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } // For most operations returning SDValue() will result in the node being @@ -1295,7 +1316,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount); SDValue MergedValues[2] = { Sra, 
Chain }; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { @@ -1332,7 +1353,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const Loads[i] = DAG.getUNDEF(ElemVT); } EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); - LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4); + LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); } else { LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, Chain, Ptr, @@ -1340,11 +1361,12 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const Op.getOperand(2)); } - SDValue Ops[2]; - Ops[0] = LoweredLoad; - Ops[1] = Chain; + SDValue Ops[2] = { + LoweredLoad, + Chain + }; - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } /// XXX Only kernel functions are supported, so we can assume for now that @@ -1365,8 +1387,7 @@ SDValue R600TargetLowering::LowerFormalArguments( SmallVector<ISD::InputArg, 8> LocalIns; - getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, - LocalIns); + getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); AnalyzeFormalArguments(CCInfo, LocalIns); @@ -1392,32 +1413,38 @@ SDValue R600TargetLowering::LowerFormalArguments( // The first 36 bytes of the input buffer contains information about // thread group and global sizes. - SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, + + // FIXME: This should really check the extload type, but the handling of + // extload vecto parameters seems to be broken. + //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + ISD::LoadExtType Ext = ISD::SEXTLOAD; + SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain, DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32), MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, false, false, 4); - // 4 is the preferred alignment for - // the CONSTANT memory space. + + // 4 is the preferred alignment for the CONSTANT memory space. 
InVals.push_back(Arg); } return Chain; } EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) return MVT::i32; + if (!VT.isVector()) + return MVT::i32; return VT.changeVectorElementTypeToInteger(); } -static SDValue -CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry, - DenseMap<unsigned, unsigned> &RemapSwizzle) { +static SDValue CompactSwizzlableVector( + SelectionDAG &DAG, SDValue VectorEntry, + DenseMap<unsigned, unsigned> &RemapSwizzle) { assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); assert(RemapSwizzle.empty()); SDValue NewBldVec[4] = { - VectorEntry.getOperand(0), - VectorEntry.getOperand(1), - VectorEntry.getOperand(2), - VectorEntry.getOperand(3) + VectorEntry.getOperand(0), + VectorEntry.getOperand(1), + VectorEntry.getOperand(2), + VectorEntry.getOperand(3) }; for (unsigned i = 0; i < 4; i++) { @@ -1448,7 +1475,7 @@ CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry, } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec, 4); + VectorEntry.getValueType(), NewBldVec); } static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, @@ -1486,7 +1513,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec, 4); + VectorEntry.getValueType(), NewBldVec); } @@ -1524,6 +1551,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) case ISD::FP_ROUND: { SDValue Arg = N->getOperand(0); @@ -1613,8 +1641,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } // Return the new vector - return DAG.getNode(ISD::BUILD_VECTOR, dl, - VT, &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } // Extract_vec (Build_vector) generated by custom lowering @@ -1638,6 +1665,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } case ISD::SELECT_CC: { + // Try common optimizations + SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + if (Ret.getNode()) + return Ret; + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> // selectcc x, y, a, b, inv(cc) // @@ -1697,7 +1729,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, }; SDLoc DL(N); NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG); - return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8); + return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); } case AMDGPUISD::TEXTURE_FETCH: { SDValue Arg = N->getOperand(1); @@ -1727,10 +1759,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, }; NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG); return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(), - NewArgs, 19); + NewArgs); } } - return SDValue(); + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } static bool @@ -1779,8 +1812,7 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) }; std::vector<unsigned> Consts; - for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) { - int OtherSrcIdx = SrcIndices[i]; + for (int OtherSrcIdx : SrcIndices) { int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); if (OtherSrcIdx < 0 || OtherSelIdx < 0) continue; @@ -1791,14 +1823,14 @@ 
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) { if (Reg->getReg() == AMDGPU::ALU_CONST) { - ConstantSDNode *Cst = dyn_cast<ConstantSDNode>( - ParentNode->getOperand(OtherSelIdx)); + ConstantSDNode *Cst + = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx)); Consts.push_back(Cst->getZExtValue()); } } } - ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset); + ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset); Consts.push_back(Cst->getZExtValue()); if (!TII->fitsConstReadLimitations(Consts)) { return false; diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index 22ef728..a8a464f 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -24,21 +24,21 @@ class R600InstrInfo; class R600TargetLowering : public AMDGPUTargetLowering { public: R600TargetLowering(TargetMachine &TM); - virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock * BB) const; - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - virtual void ReplaceNodeResults(SDNode * N, - SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) const override; - virtual SDValue LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; - virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock * BB) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + SDValue LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + EVT getSetCCResultType(LLVMContext &, EVT VT) const override; private: unsigned Gen; /// Each OpenCL kernel has nine implicit parameters that are stored in the @@ -66,7 +66,7 @@ private: void getStackAddress(unsigned StackWidth, unsigned ElemIdx, unsigned &Channel, unsigned &PtrIncr) const; bool isZero(SDValue Op) const; - virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; }; } // End namespace llvm; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 0281dd0..b0d9ae3 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -23,11 +23,11 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +using namespace llvm; + #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenDFAPacketizer.inc" -using namespace llvm; - R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm) : AMDGPUInstrInfo(tm), RI(tm), @@ -677,7 +677,7 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB, return MI; } - return NULL; + return nullptr; } static @@ -797,7 +797,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, DebugLoc DL) const { assert(TBB && "InsertBranch must not be told to insert a fallthrough"); 
- if (FBB == 0) { + if (!FBB) { if (Cond.empty()) { BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); return 1; diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index d5ff4de..b5304a0 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -50,13 +50,13 @@ namespace llvm { explicit R600InstrInfo(AMDGPUTargetMachine &tm); - const R600RegisterInfo &getRegisterInfo() const; - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; + const R600RegisterInfo &getRegisterInfo() const override; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const; + MachineBasicBlock::iterator MBBI) const override; bool isTrig(const MachineInstr &MI) const; bool isPlaceHolderOpcode(unsigned opcode) const; @@ -142,79 +142,79 @@ namespace llvm { /// instruction slots within an instruction group. bool isVector(const MachineInstr &MI) const; - virtual unsigned getIEQOpcode() const; - virtual bool isMov(unsigned Opcode) const; + unsigned getIEQOpcode() const override; + bool isMov(unsigned Opcode) const override; DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM, - const ScheduleDAG *DAG) const; + const ScheduleDAG *DAG) const override; - bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const; + SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - bool isPredicated(const MachineInstr *MI) const; + bool isPredicated(const MachineInstr *MI) const override; - bool isPredicable(MachineInstr *MI) const; + bool isPredicable(MachineInstr *MI) const override; bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) const; + const BranchProbability &Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const ; + const BranchProbability &Probability) const override ; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const; + const BranchProbability &Probability) const override; bool DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const; + std::vector<MachineOperand> &Pred) const override; bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const; + const SmallVectorImpl<MachineOperand> &Pred2) const override; bool 
isProfitableToUnpredicate(MachineBasicBlock &TMBB, - MachineBasicBlock &FMBB) const; + MachineBasicBlock &FMBB) const override; bool PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const; + const SmallVectorImpl<MachineOperand> &Pred) const override; - unsigned int getPredicationCost(const MachineInstr *) const; + unsigned int getPredicationCost(const MachineInstr *) const override; unsigned int getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr *MI, - unsigned *PredCost = 0) const; + unsigned *PredCost = nullptr) const override; - virtual int getInstrLatency(const InstrItineraryData *ItinData, - SDNode *Node) const { return 1;} + int getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const override { return 1;} /// \brief Reserve the registers that may be accesed using indirect addressing. void reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const; - virtual unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const; + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; - virtual const TargetRegisterClass *getIndirectAddrRegClass() const; + const TargetRegisterClass *getIndirectAddrRegClass() const override; - virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const; + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; - virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const; + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; unsigned getMaxAlusPerClause() const; @@ -244,7 +244,7 @@ namespace llvm { MachineInstr *buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const; + unsigned DstReg, unsigned SrcReg) const override; /// \brief Get the index of Op in the MachineInstr. 
/// diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index d2075c0..590fde2 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1625,6 +1625,12 @@ def : DwordAddrPat <i32, R600_Reg32>; } // End isR600toCayman Predicate +let Predicates = [isR600] in { +// Intrinsic patterns +defm : Expand24IBitOps<MULLO_INT_r600, ADD_INT>; +defm : Expand24UBitOps<MULLO_UINT_r600, ADD_INT>; +} // End isR600 + def getLDSNoRetOp : InstrMapping { let FilterClass = "R600_LDS_1A1D"; let RowFields = ["BaseOp"]; diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h index c1bec0a..b0ae22e 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ b/lib/Target/R600/R600MachineFunctionInfo.h @@ -21,7 +21,7 @@ namespace llvm { class R600MachineFunctionInfo : public AMDGPUMachineFunction { - virtual void anchor(); + void anchor() override; public: R600MachineFunctionInfo(const MachineFunction &MF); SmallVector<unsigned, 4> LiveOuts; diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index d3ffb50..d1655d1 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -12,8 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "misched" - #include "R600MachineScheduler.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -23,6 +21,8 @@ using namespace llvm; +#define DEBUG_TYPE "misched" + void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); DAG = static_cast<ScheduleDAGMILive*>(dag); @@ -56,7 +56,7 @@ unsigned getWFCountLimitedByGPR(unsigned GPRCount) { } SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { - SUnit *SU = 0; + SUnit *SU = nullptr; NextInstKind = IDOther; IsTopNode = false; @@ -316,7 +316,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) { SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { if (Q.empty()) - return NULL; + return nullptr; for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend(); It != E; ++It) { SUnit *SU = *It; @@ -331,7 +331,7 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { InstructionsGroupCandidate.pop_back(); } } - return NULL; + return nullptr; } void R600SchedStrategy::LoadAlu() { @@ -448,11 +448,11 @@ SUnit* R600SchedStrategy::pickAlu() { } PrepareNextSlot(); } - return NULL; + return nullptr; } SUnit* R600SchedStrategy::pickOther(int QID) { - SUnit *SU = 0; + SUnit *SU = nullptr; std::vector<SUnit *> &AQ = Available[QID]; if (AQ.empty()) { diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h index b909ff7..fd475af 100644 --- a/lib/Target/R600/R600MachineScheduler.h +++ b/lib/Target/R600/R600MachineScheduler.h @@ -68,17 +68,16 @@ class R600SchedStrategy : public MachineSchedStrategy { public: R600SchedStrategy() : - DAG(0), TII(0), TRI(0), MRI(0) { + DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) { } - virtual ~R600SchedStrategy() { - } + virtual ~R600SchedStrategy() {} - virtual void initialize(ScheduleDAGMI *dag); - virtual SUnit *pickNode(bool &IsTopNode); - virtual void schedNode(SUnit *SU, bool IsTopNode); - virtual void releaseTopNode(SUnit *SU); - virtual void releaseBottomNode(SUnit *SU); + void initialize(ScheduleDAGMI *dag) override; + SUnit *pickNode(bool &IsTopNode) 
override; + void schedNode(SUnit *SU, bool IsTopNode) override; + void releaseTopNode(SUnit *SU) override; + void releaseBottomNode(SUnit *SU) override; private: std::vector<MachineInstr *> InstructionsGroupCandidate; diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp index 767e5e3..2314136 100644 --- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp @@ -27,7 +27,6 @@ /// to reduce MOV count. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "vec-merger" #include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "R600InstrInfo.h" @@ -42,6 +41,8 @@ using namespace llvm; +#define DEBUG_TYPE "vec-merger" + namespace { static bool @@ -107,9 +108,9 @@ private: public: static char ID; R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), - TII(0) { } + TII(nullptr) { } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); @@ -118,11 +119,11 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Vector Registers Merge Pass"; } - bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; }; char R600VectorRegMerger::ID = 0; diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index b7b7610..c2f6c03 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "packets" #include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "R600InstrInfo.h" @@ -28,6 +27,8 @@ using namespace llvm; +#define DEBUG_TYPE "packets" + namespace { class R600Packetizer : public MachineFunctionPass { @@ -36,7 +37,7 @@ public: static char ID; R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); @@ -45,11 +46,11 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Packetizer"; } - bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; }; char R600Packetizer::ID = 0; @@ -155,18 +156,19 @@ public: } // initPacketizerState - initialize some internal flags. - void initPacketizerState() { + void initPacketizerState() override { ConsideredInstUsesAlreadyWrittenVectorElement = false; } // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) { + bool ignorePseudoInstruction(MachineInstr *MI, + MachineBasicBlock *MBB) override { return false; } // isSoloInstruction - return true if instruction MI can not be packetized // with any other instruction, which means that MI itself is a packet. 
- bool isSoloInstruction(MachineInstr *MI) { + bool isSoloInstruction(MachineInstr *MI) override { if (TII->isVector(*MI)) return true; if (!TII->isALUInstr(MI->getOpcode())) @@ -182,7 +184,7 @@ public: // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ // together. - bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { + bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); if (getSlot(MII) == getSlot(MIJ)) ConsideredInstUsesAlreadyWrittenVectorElement = true; @@ -219,7 +221,9 @@ public: // isLegalToPruneDependencies - Is it legal to prune dependece between SUI // and SUJ. - bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {return false;} + bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { + return false; + } void setIsLastBit(MachineInstr *MI, unsigned Bit) const { unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); @@ -288,7 +292,7 @@ public: return true; } - MachineBasicBlock::iterator addToPacket(MachineInstr *MI) { + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { MachineBasicBlock::iterator FirstInBundle = CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front(); const DenseMap<unsigned, unsigned> &PV = diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h index c74c49e..52e1a4b 100644 --- a/lib/Target/R600/R600RegisterInfo.h +++ b/lib/Target/R600/R600RegisterInfo.h @@ -28,27 +28,28 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { R600RegisterInfo(AMDGPUTargetMachine &tm); - virtual BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; /// \param RC is an AMDIL reg class. /// /// \returns the R600 reg class that is equivalent to \p RC. - virtual const TargetRegisterClass *getISARegClass( - const TargetRegisterClass *RC) const; + const TargetRegisterClass *getISARegClass( + const TargetRegisterClass *RC) const override; /// \brief get the HW encoding for a register's channel. unsigned getHWRegChan(unsigned reg) const; - virtual unsigned getHWRegIndex(unsigned Reg) const; + unsigned getHWRegIndex(unsigned Reg) const override; /// \brief get the register class of the specified type to use in the /// CFGStructurizer - virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const; + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const; + const RegClassWeight & + getRegClassWeight(const TargetRegisterClass *RC) const override; // \returns true if \p Reg can be defined in one ALU caluse and used in another. 
- virtual bool isPhysRegLiveAcrossClauses(unsigned Reg) const; + bool isPhysRegLiveAcrossClauses(unsigned Reg) const; }; } // End namespace llvm diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp index 9d24404..419ec8b 100644 --- a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp +++ b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp @@ -209,7 +209,7 @@ public: FunctionPass(ID) { } - virtual bool doInitialization(Module &M) { + bool doInitialization(Module &M) override { LLVMContext &Ctx = M.getContext(); Mod = &M; FloatType = Type::getFloatTy(Ctx); @@ -245,16 +245,16 @@ public: return false; } - virtual bool runOnFunction(Function &F) { + bool runOnFunction(Function &F) override { visit(F); return false; } - virtual const char *getPassName() const { + const char *getPassName() const override { return "R600 Texture Intrinsics Replacer"; } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { } void visitCallInst(CallInst &I) { diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp index f9214a8..d6e4451 100644 --- a/lib/Target/R600/SIAnnotateControlFlow.cpp +++ b/lib/Target/R600/SIAnnotateControlFlow.cpp @@ -12,8 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "si-annotate-control-flow" - #include "AMDGPU.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/IR/Constants.h" @@ -26,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "si-annotate-control-flow" + namespace { // Complex types used in this pass @@ -91,15 +91,15 @@ public: SIAnnotateControlFlow(): FunctionPass(ID) { } - virtual bool doInitialization(Module &M); + bool doInitialization(Module &M) override; - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "SI annotate control flow"; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); FunctionPass::getAnalysisUsage(AU); @@ -118,7 +118,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { Void = Type::getVoidTy(Context); Boolean = Type::getInt1Ty(Context); Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64, (Type *)0); + ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); BoolTrue = ConstantInt::getTrue(Context); BoolFalse = ConstantInt::getFalse(Context); @@ -126,25 +126,25 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { Int64Zero = ConstantInt::get(Int64, 0); If = M.getOrInsertFunction( - IfIntrinsic, ReturnStruct, Boolean, (Type *)0); + IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); Else = M.getOrInsertFunction( - ElseIntrinsic, ReturnStruct, Int64, (Type *)0); + ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); Break = M.getOrInsertFunction( - BreakIntrinsic, Int64, Int64, (Type *)0); + BreakIntrinsic, Int64, Int64, (Type *)nullptr); IfBreak = M.getOrInsertFunction( - IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0); + IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); ElseBreak = M.getOrInsertFunction( - ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0); + ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); Loop = M.getOrInsertFunction( - LoopIntrinsic, Boolean, 
Int64, (Type *)0); + LoopIntrinsic, Boolean, Int64, (Type *)nullptr); EndCf = M.getOrInsertFunction( - EndCfIntrinsic, Void, Int64, (Type *)0); + EndCfIntrinsic, Void, Int64, (Type *)nullptr); return false; } diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp index 402f1f4..5f71453 100644 --- a/lib/Target/R600/SIFixSGPRCopies.cpp +++ b/lib/Target/R600/SIFixSGPRCopies.cpp @@ -65,7 +65,6 @@ /// ultimately led to the creation of an illegal COPY. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sgpr-copies" #include "AMDGPU.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -77,6 +76,8 @@ using namespace llvm; +#define DEBUG_TYPE "sgpr-copies" + namespace { class SIFixSGPRCopies : public MachineFunctionPass { @@ -97,9 +98,9 @@ private: public: SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "SI Fix SGPR copies"; } @@ -184,7 +185,8 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, const TargetRegisterClass *SrcRC; if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - DstRC == &AMDGPU::M0RegRegClass) + DstRC == &AMDGPU::M0RegRegClass || + MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) return false; SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); @@ -256,6 +258,19 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { TII->moveToVALU(MI); break; } + case AMDGPU::INSERT_SUBREG: { + const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; + DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); + Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); + Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); + if (TRI->isSGPRClass(DstRC) && + (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { + DEBUG(dbgs() << " Fixing INSERT_SUBREG:\n"); + DEBUG(MI.print(dbgs())); + TII->moveToVALU(MI); + } + break; + } } } } diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 0b55411..c9e247c 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -29,22 +29,21 @@ using namespace llvm; SITargetLowering::SITargetLowering(TargetMachine &TM) : AMDGPUTargetLowering(TM) { - addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass); + addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); - addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass); + addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); - addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass); + addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); - addRegisterClass(MVT::i128, 
&AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); @@ -78,8 +77,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); - setOperationAction(ISD::BITCAST, MVT::i128, Legal); - // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); @@ -99,10 +96,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::i1, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i64, Custom); - setOperationAction(ISD::STORE, MVT::i128, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::SELECT, MVT::f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); @@ -119,6 +117,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); @@ -126,39 +140,48 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setTruncStoreAction(MVT::i32, MVT::i8, Custom); setTruncStoreAction(MVT::i32, MVT::i16, Custom); setTruncStoreAction(MVT::f64, MVT::f32, Expand); 
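The SIGN_EXTEND_INREG and extending-load settings in this hunk all revolve around the same operation: re-interpreting the low i1/i8/i16 bits of a 32-bit register as a signed value. As a point of reference (not part of the patch), a minimal C++ sketch of what that computes; the scalar forms are later mapped onto V_BFE_I32 in SIInstrInfo.cpp:

    #include <cstdint>

    // Sign-extend the low FromBits bits of a 32-bit value: the effect of
    // ISD::SIGN_EXTEND_INREG, and of V_BFE_I32 with offset 0 and width FromBits
    // once S_SEXT_I32_I8/I16 are moved to the VALU later in this patch.
    static int32_t signExtendInReg(uint32_t Value, unsigned FromBits) {
      unsigned Shift = 32 - FromBits;
      // Relies on arithmetic right shift of signed values, as on LLVM host platforms.
      return static_cast<int32_t>(Value << Shift) >> Shift;
    }

    // signExtendInReg(0xFF, 8) == -1, signExtendInReg(0x7F, 8) == 127
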
setTruncStoreAction(MVT::i64, MVT::i32, Expand); - setTruncStoreAction(MVT::i128, MVT::i64, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + // These should use UDIVREM, so set them to expand + setOperationAction(ISD::UDIV, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. MVT VecTypes[] = { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32 }; - const size_t NumVecTypes = array_lengthof(VecTypes); - for (unsigned Type = 0; Type < NumVecTypes; ++Type) { + for (MVT VT : VecTypes) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -172,7 +195,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : case ISD::EXTRACT_SUBVECTOR: break; default: - setOperationAction(Op, VecTypes[Type], Expand); + setOperationAction(Op, VT, Expand); break; } } @@ -189,6 +212,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); } setTargetDAGCombine(ISD::SELECT_CC); @@ -204,10 +228,40 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace, bool *IsFast) const { + if (IsFast) + *IsFast = false; + // XXX: This depends on the address space and also we may want to revist // the alignment values we specify in the DataLayout. + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. if (!VT.isSimple() || VT == MVT::Other) return false; + + // XXX - CI changes say "Support for unaligned memory accesses" but I don't + // see what for specifically. The wording everywhere else seems to be the + // same. + + // 3.6.4 - Operations using pairs of VGPRs (for example: double-floats) have + // no alignment restrictions. + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { + // Using any pair of GPRs should be the same as any other pair. + if (IsFast) + *IsFast = true; + return VT.bitsGE(MVT::i64); + } + + // XXX - The only mention I see of this in the ISA manual is for LDS direct + // reads the "byte address and must be dword aligned". Is it also true for the + // normal loads and stores? + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) + return false; + + // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the + // byte-address are ignored, thus forcing Dword alignment. 
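The address-space comments above are easier to follow when the whole decision is stated in one place. A distilled restatement of what allowsUnalignedMemoryAccesses now returns (the helper name is illustrative; the address-space constants are the patch's own from AMDGPU.h); the closing lines of the function, just below, handle the final fast-path return:

    // Private (scratch): pairs of GPRs have no alignment restriction, so 64-bit
    // and wider is fine.  Local (LDS): unaligned access is rejected outright.
    // Everything else: the 2 LSBs of the byte address are ignored, so accesses
    // wider than a dword are effectively dword-aligned and reported as fast.
    static bool unalignedAccessOK(unsigned AddrSpace, unsigned SizeInBits) {
      if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
        return SizeInBits >= 64;
      if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS)
        return false;
      return SizeInBits > 32;
    }
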
+ if (IsFast) + *IsFast = true; return VT.bitsGT(MVT::i32); } @@ -224,7 +278,7 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, SDValue Chain, - unsigned Offset) const { + unsigned Offset, bool Signed) const { MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), AMDGPUAS::CONSTANT_ADDRESS); @@ -232,7 +286,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64); SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, MVT::i64)); - return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr, + return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr, MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, false, false, MemVT.getSizeInBits() >> 3); @@ -340,7 +394,8 @@ SDValue SITargetLowering::LowerFormalArguments( // The first 36 bytes of the input buffer contains information about // thread group and global sizes. SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), - 36 + VA.getLocMemOffset()); + 36 + VA.getLocMemOffset(), + Ins[i].Flags.isSExt()); InVals.push_back(Arg); continue; } @@ -381,8 +436,7 @@ SDValue SITargetLowering::LowerFormalArguments( for (unsigned j = 0; j != NumElements; ++j) Regs.push_back(DAG.getUNDEF(VT)); - InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, - Regs.data(), Regs.size())); + InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); continue; } @@ -395,15 +449,15 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned SuperReg = MI->getOperand(0).getReg(); unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); @@ -428,9 +482,7 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI->eraseFromParent(); break; } - case AMDGPU::V_SUB_F64: { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + case AMDGPU::V_SUB_F64: BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) @@ -442,11 +494,9 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( .addImm(2); /* NEG */ MI->eraseFromParent(); break; - } + case AMDGPU::SI_RegisterStorePseudo: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); MachineInstrBuilder MIB = BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), @@ -455,6 +505,50 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MIB.addOperand(MI->getOperand(i)); 
MI->eraseFromParent(); + break; + } + case AMDGPU::FABS_SI: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), + Reg) + .addImm(0x7fffffff); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addReg(Reg); + MI->eraseFromParent(); + break; + } + case AMDGPU::FNEG_SI: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), + Reg) + .addImm(0x80000000); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addReg(Reg); + MI->eraseFromParent(); + break; + } + case AMDGPU::FCLAMP_SI: { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64), + MI->getOperand(0).getReg()) + .addImm(0) // SRC0 modifiers + .addOperand(MI->getOperand(1)) + .addImm(0) // SRC1 modifiers + .addImm(0) // SRC1 + .addImm(1) // CLAMP + .addImm(0); // OMOD + MI->eraseFromParent(); } } return BB; @@ -510,7 +604,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { SplitVectorLoad(Op, DAG), Load->getChain() }; - return DAG.getMergeValues(MergedValues, 2, SDLoc(Op)); + return DAG.getMergeValues(MergedValues, SDLoc(Op)); } else { return LowerLOAD(Op, DAG); } @@ -533,23 +627,23 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (IntrinsicID) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case Intrinsic::r600_read_ngroups_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false); case Intrinsic::r600_read_ngroups_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false); case Intrinsic::r600_read_ngroups_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false); case Intrinsic::r600_read_global_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false); case Intrinsic::r600_read_global_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false); case Intrinsic::r600_read_global_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 
32); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT); @@ -570,7 +664,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { AMDGPU::VGPR2, VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops [] = { - ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(1), Op.getOperand(2) }; @@ -579,7 +673,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, VT.getSizeInBits() / 8, 4); return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, - Op->getVTList(), Ops, 2, VT, MMO); + Op->getVTList(), Ops, VT, MMO); } case AMDGPUIntrinsic::SI_sample: return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); @@ -591,7 +685,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, - ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } @@ -606,7 +700,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Ops [] = { Chain, - ResourceDescriptorToi128(Op.getOperand(2), DAG), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), @@ -627,8 +721,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { MachineMemOperand::MOStore, VT.getSizeInBits() / 8, 4); return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, - sizeof(Ops)/sizeof(Ops[0]), VT, MMO); + Op->getVTList(), Ops, VT, MMO); } default: break; @@ -650,7 +743,7 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { if (I->getOpcode() == Opcode) return *I; } - return 0; + return nullptr; } /// This transforms the control flow intrinsics to get the branch destination as @@ -662,7 +755,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SDNode *Intr = BRCOND.getOperand(1).getNode(); SDValue Target = BRCOND.getOperand(2); - SDNode *BR = 0; + SDNode *BR = nullptr; if (Intr->getOpcode() == ISD::SETCC) { // As long as we negate the condition everything is fine @@ -695,7 +788,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, // build the new intrinsic call SDNode *Result = DAG.getNode( Res.size() > 1 ? 
ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, - DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode(); + DAG.getVTList(Res), Ops).getNode(); if (BR) { // Give the branch instruction our target @@ -703,7 +796,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, BR->getOperand(0), BRCOND.getOperand(2) }; - DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2); + DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops); } SDValue Chain = SDValue(Result, Result->getNumValues() - 1); @@ -739,7 +832,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { MergedValues[1] = Load->getChain(); if (Ret.getNode()) { MergedValues[0] = Ret; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { @@ -770,30 +863,16 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } MergedValues[0] = Ret; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } -SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op, - SelectionDAG &DAG) const { - - if (Op.getValueType() == MVT::i128) { - return Op; - } - - assert(Op.getOpcode() == ISD::UNDEF); - - return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128, - DAG.getConstant(0, MVT::i64), - DAG.getConstant(0, MVT::i64)); -} - SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, SelectionDAG &DAG) const { return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2), - ResourceDescriptorToi128(Op.getOperand(3), DAG), + Op.getOperand(3), Op.getOperand(4)); } @@ -833,12 +912,6 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); - // Possible Min/Max pattern - SDValue MinMax = LowerMinMax(Op, DAG); - if (MinMax.getNode()) { - return MinMax; - } - SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); } @@ -948,8 +1021,12 @@ SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op, return SDValue(); } - return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); + SDValue Src = Op.getOperand(0); + if (Src.getValueType() != MVT::i32) + Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); + + SDValue Zero = DAG.getConstant(0, MVT::i32); + return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero); } //===----------------------------------------------------------------------===// @@ -963,7 +1040,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, EVT VT = N->getValueType(0); switch (N->getOpcode()) { - default: break; + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); case ISD::SELECT_CC: { ConstantSDNode *True, *False; // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) @@ -982,7 +1059,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, SDValue Arg0 = N->getOperand(0); SDValue Arg1 = N->getOperand(1); SDValue CC = N->getOperand(2); - ConstantSDNode * C = NULL; + ConstantSDNode * C = nullptr; ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get(); // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) @@ -998,7 +1075,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } } - return SDValue(); + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } /// \brief Test if RegClass is one of the VSrc classes @@ -1029,9 +1107,11 @@ int32_t 
SITargetLowering::analyzeImmediate(const SDNode *N) const { return -1; } Imm.I = Node->getSExtValue(); - } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) + } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { + if (N->getValueType(0) != MVT::f32) + return -1; Imm.F = Node->getValueAPF().convertToFloat(); - else + } else return -1; // It isn't an immediate if ((Imm.I >= -16 && Imm.I <= 64) || @@ -1051,7 +1131,7 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode())) + if (!Mov || !TII->isMov(Mov->getMachineOpcode())) return false; const SDValue &Op = Mov->getOperand(0); @@ -1098,7 +1178,7 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode( } return TRI.getPhysRegClass(Reg); } - default: return NULL; + default: return nullptr; } } const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode()); @@ -1202,17 +1282,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, // Commuted opcode if available int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1; - const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev); + const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev); assert(!DescRev || DescRev->getNumDefs() == NumDefs); assert(!DescRev || DescRev->getNumOperands() == NumOps); // e64 version if available, -1 otherwise int OpcodeE64 = AMDGPU::getVOPe64(Opcode); - const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64); + const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64); + int InputModifiers[3] = {0}; assert(!DescE64 || DescE64->getNumDefs() == NumDefs); - assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4)); int32_t Immediate = Desc->getSize() == 4 ? 
0 : -1; bool HaveVSrc = false, HaveSSrc = false; @@ -1279,17 +1359,18 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, fitsRegClass(DAG, Ops[1], OtherRegClass))) { // Swap commutable operands - SDValue Tmp = Ops[1]; - Ops[1] = Ops[0]; - Ops[0] = Tmp; + std::swap(Ops[0], Ops[1]); Desc = DescRev; - DescRev = 0; + DescRev = nullptr; continue; } } - if (DescE64 && !Immediate) { + if (Immediate) + continue; + + if (DescE64) { // Test if it makes sense to switch to e64 encoding unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; @@ -1305,14 +1386,46 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, Immediate = -1; Promote2e64 = true; Desc = DescE64; - DescE64 = 0; + DescE64 = nullptr; } } + + if (!DescE64 && !Promote2e64) + continue; + if (!Operand.isMachineOpcode()) + continue; + if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) { + Ops.pop_back(); + Ops.push_back(Operand.getOperand(0)); + InputModifiers[i] = 1; + Promote2e64 = true; + if (!DescE64) + continue; + Desc = DescE64; + DescE64 = 0; + } + else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) { + Ops.pop_back(); + Ops.push_back(Operand.getOperand(0)); + InputModifiers[i] = 2; + Promote2e64 = true; + if (!DescE64) + continue; + Desc = DescE64; + DescE64 = 0; + } } if (Promote2e64) { + std::vector<SDValue> OldOps(Ops); + Ops.clear(); + for (unsigned i = 0; i < OldOps.size(); ++i) { + // src_modifier + Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32)); + Ops.push_back(OldOps[i]); + } // Add the modifier flags while promoting - for (unsigned i = 0; i < 4; ++i) + for (unsigned i = 0; i < 2; ++i) Ops.push_back(DAG.getTargetConstant(0, MVT::i32)); } @@ -1390,7 +1503,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32)); for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) Ops.push_back(Node->getOperand(i)); - Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); + Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); // If we only got one lane, replace it with a copy // (if NewDmask has only one bit set...) 
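Both the FABS_SI/FNEG_SI expansions earlier in this file and the foldOperands() change that folds those nodes into VOP3 source-modifier operands rest on the same fact: for IEEE-754 single precision, absolute value and negation only touch the sign bit. A small self-contained sketch of that equivalence (helper names are illustrative, not the patch's):

    #include <cstdint>
    #include <cstring>

    static uint32_t toBits(float F)      { uint32_t B; std::memcpy(&B, &F, 4); return B; }
    static float    fromBits(uint32_t B) { float F; std::memcpy(&F, &B, 4); return F; }

    // What the FABS_SI expansion computes with V_AND_B32 and 0x7fffffff.
    static float fabsViaMask(float F) { return fromBits(toBits(F) & 0x7fffffffu); }

    // What the FNEG_SI expansion computes with V_XOR_B32 and 0x80000000.
    static float fnegViaMask(float F) { return fromBits(toBits(F) ^ 0x80000000u); }

Folding these two operations into the new src*_modifiers operands (see the VOP3 encoding changes in SIInstrFormats.td further down) lets the hardware apply neg/abs to a source operand directly instead of spending a separate VALU instruction on it.
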
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index ca73f53..c6eaa81 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -22,7 +22,7 @@ namespace llvm { class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, - SDValue Chain, unsigned Offset) const; + SDValue Chain, unsigned Offset, bool Signed) const; SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; @@ -33,7 +33,6 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue ResourceDescriptorToi128(SDValue Op, SelectionDAG &DAG) const; bool foldImm(SDValue &Operand, int32_t &Immediate, bool &ScalarSlotUsed) const; const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG, @@ -49,32 +48,33 @@ class SITargetLowering : public AMDGPUTargetLowering { public: SITargetLowering(TargetMachine &tm); - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, bool *IsFast) const; - virtual bool shouldSplitVectorType(EVT VT) const override; + bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, + bool *IsFast) const override; + bool shouldSplitVectorType(EVT VT) const override; - virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; + SmallVectorImpl<SDValue> &InVals) const override; - virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, - MachineBasicBlock * BB) const; - virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; - virtual MVT getScalarShiftAmountTy(EVT VT) const; - virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const; - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const; - virtual void AdjustInstrPostInstrSelection(MachineInstr *MI, - SDNode *Node) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, + MachineBasicBlock * BB) const override; + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + MVT getScalarShiftAmountTy(EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; + void AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const override; int32_t analyzeImmediate(const SDNode *N) const; SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; + unsigned Reg, EVT VT) const override; }; } // End namespace llvm diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 695ec40..a17fed7 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -97,13 +97,13 @@ private: public: SIInsertWaits(TargetMachine &tm) : 
MachineFunctionPass(ID), - TII(0), - TRI(0), + TII(nullptr), + TRI(nullptr), ExpInstrTypesSeen(0) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "SI insert wait instructions"; } diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index aa2c22c..168eff2 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : - AMDGPUInst<outs, ins, asm, pattern> { + AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { field bits<1> VM_CNT = 0; field bits<1> EXP_CNT = 0; @@ -210,16 +210,19 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : Enc64 <outs, ins, asm, pattern> { bits<8> dst; + bits<2> src0_modifiers; bits<9> src0; + bits<2> src1_modifiers; bits<9> src1; + bits<2> src2_modifiers; bits<9> src2; - bits<3> abs; bits<1> clamp; bits<2> omod; - bits<3> neg; let Inst{7-0} = dst; - let Inst{10-8} = abs; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; let Inst{11} = clamp; let Inst{25-17} = op; let Inst{31-26} = 0x34; //encoding @@ -227,7 +230,9 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{49-41} = src1; let Inst{58-50} = src2; let Inst{60-59} = omod; - let Inst{63-61} = neg; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; let mayLoad = 0; let mayStore = 0; @@ -240,12 +245,14 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : Enc64 <outs, ins, asm, pattern> { bits<8> dst; + bits<2> src0_modifiers; bits<9> src0; + bits<2> src1_modifiers; bits<9> src1; + bits<2> src2_modifiers; bits<9> src2; bits<7> sdst; bits<2> omod; - bits<3> neg; let Inst{7-0} = dst; let Inst{14-8} = sdst; @@ -255,7 +262,9 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{49-41} = src1; let Inst{58-50} = src2; let Inst{60-59} = omod; - let Inst{63-61} = neg; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; let mayLoad = 0; let mayStore = 0; diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index ab2fe09..4a9e346 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -187,27 +187,45 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned KillFlag = isKill ? 
RegState::Kill : 0; + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) { - unsigned Lane = MFI->SpillTracker.getNextLane(MRI); - BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), - MFI->SpillTracker.LaneVGPR) + unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent()); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), MFI->SpillTracker.LaneVGPR) .addReg(SrcReg, KillFlag) .addImm(Lane); + MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane); + } else if (RI.isSGPRClass(RC)) { + // We are only allowed to create one new instruction when spilling + // registers, so we need to use pseudo instruction for vector + // registers. + // + // Reserve a spot in the spill tracker for each sub-register of + // the vector register. + unsigned NumSubRegs = RC->getSize() / 4; + unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent(), + NumSubRegs); MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, - Lane); - } else { - for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) { - unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), get(AMDGPU::COPY), SubReg) - .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); - storeRegToStackSlot(MBB, MI, SubReg, isKill, FrameIndex + i, - &AMDGPU::SReg_32RegClass, TRI); + FirstLane); + + unsigned Opcode; + switch (RC->getSize() * 8) { + case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; + default: llvm_unreachable("Cannot spill register class"); } + + BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR) + .addReg(SrcReg) + .addImm(FrameIndex); + } else { + llvm_unreachable("VGPR spilling not supported"); } } @@ -216,30 +234,125 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); DebugLoc DL = MBB.findDebugLoc(MI); if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) { - SIMachineFunctionInfo::SpilledReg Spill = + SIMachineFunctionInfo::SpilledReg Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); assert(Spill.VGPR); BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg) .addReg(Spill.VGPR) .addImm(Spill.Lane); + insertNOPs(MI, 3); + } else if (RI.isSGPRClass(RC)){ + unsigned Opcode; + switch(RC->getSize() * 8) { + case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; + default: llvm_unreachable("Cannot spill register class"); + } + + SIMachineFunctionInfo::SpilledReg Spill = + MFI->SpillTracker.getSpilledReg(FrameIndex); + + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addReg(Spill.VGPR) + .addImm(FrameIndex); + insertNOPs(MI, 3); } else { - for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) { - unsigned Flags = RegState::Define; - if (i == 0) { - Flags |= RegState::Undef; - } - unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - loadRegFromStackSlot(MBB, MI, SubReg, FrameIndex + i, - &AMDGPU::SReg_32RegClass, TRI); - BuildMI(MBB, 
MI, DL, get(AMDGPU::COPY)) - .addReg(DestReg, Flags, RI.getSubRegFromChannel(i)) - .addReg(SubReg); + llvm_unreachable("VGPR spilling not supported"); + } +} + +static unsigned getNumSubRegsForSpillOp(unsigned Op) { + + switch (Op) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S512_RESTORE: + return 16; + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S256_RESTORE: + return 8; + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S128_RESTORE: + return 4; + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S64_RESTORE: + return 2; + default: llvm_unreachable("Invalid spill opcode"); + } +} + +void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, + int Count) const { + while (Count > 0) { + int Arg; + if (Count >= 8) + Arg = 7; + else + Arg = Count - 1; + Count -= 8; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) + .addImm(Arg); + } +} + +bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + SIMachineFunctionInfo *MFI = + MI->getParent()->getParent()->getInfo<SIMachineFunctionInfo>(); + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MBB.findDebugLoc(MI); + switch (MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + + // SGPR register spill + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned FrameIndex = MI->getOperand(2).getImm(); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + SIMachineFunctionInfo::SpilledReg Spill; + unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(), + &AMDGPU::SGPR_32RegClass, i); + Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), + MI->getOperand(0).getReg()) + .addReg(SubReg) + .addImm(Spill.Lane + i); + } + MI->eraseFromParent(); + break; + } + + // SGPR register restore + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + SIMachineFunctionInfo::SpilledReg Spill; + unsigned FrameIndex = MI->getOperand(2).getImm(); + unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg) + .addReg(MI->getOperand(1).getReg()) + .addImm(Spill.Lane + i); } + MI->eraseFromParent(); + break; } + } + return true; } MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, @@ -247,18 +360,18 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg()) - return 0; + return nullptr; // Cannot commute VOP2 if src0 is SGPR. if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() && RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg()))) - return 0; + return nullptr; if (!MI->getOperand(2).isReg()) { // XXX: Commute instructions with FPImm operands if (NewMI || MI->getOperand(2).isFPImm() || (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { - return 0; + return nullptr; } // XXX: Commute VOP3 instructions with abs and neg set. 
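The spill rework above replaces the old per-subregister COPY loop with SI_SPILL_S*_SAVE/RESTORE pseudos that are expanded after register allocation into V_WRITELANE_B32/V_READLANE_B32 pairs: every 32-bit scalar sub-register is parked in one lane of a reserved VGPR. A toy model of that bookkeeping (types and names here are illustrative, not the patch's):

    #include <cstdint>

    // One reserved lane VGPR holds 64 lanes of 32 bits (one per wavefront lane).
    struct LaneVGPR {
      uint32_t Lane[64] = {};
    };

    // An SGPR tuple of N bytes needs N/4 consecutive lanes, mirroring
    // getNumSubRegsForSpillOp() in the patch (S64 -> 2, S128 -> 4, ...).
    static unsigned lanesFor(unsigned RegClassSizeInBytes) {
      return RegClassSizeInBytes / 4;
    }

    // v_writelane_b32 equivalent: save one 32-bit sub-register into a lane.
    static void writeLane(LaneVGPR &V, unsigned Lane, uint32_t Val) { V.Lane[Lane] = Val; }

    // v_readlane_b32 equivalent: recover it on restore.
    static uint32_t readLane(const LaneVGPR &V, unsigned Lane) { return V.Lane[Lane]; }

The restore path also inserts a few S_NOPs after the readlane (insertNOPs(MI, 3) above), apparently to cover the latency before the reloaded SGPR can be consumed.
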
@@ -267,7 +380,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, AMDGPU::OpName::abs)).getImm() || MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::neg)).getImm())) - return 0; + return nullptr; unsigned Reg = MI->getOperand(1).getReg(); unsigned SubReg = MI->getOperand(1).getSubReg(); @@ -516,6 +629,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; case AMDGPU::COPY: return AMDGPU::COPY; case AMDGPU::PHI: return AMDGPU::PHI; + case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::S_MOV_B32: return MI.getOperand(1).isReg() ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; @@ -536,6 +650,23 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; + case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; + case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; + case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; + case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; + case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; + case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; + case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; + case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; } } @@ -559,6 +690,8 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::REG_SEQUENCE: + case AMDGPU::PHI: + case AMDGPU::INSERT_SUBREG: return RI.hasVGPRs(getOpRegClass(MI, 0)); default: return RI.hasVGPRs(getOpRegClass(MI, OpNo)); @@ -737,11 +870,12 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } - // Legalize REG_SEQUENCE + // Legalize REG_SEQUENCE and PHI // The register class of the operands much be the same type as the register // class of the output. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { - const TargetRegisterClass *RC = NULL, *SRC = NULL, *VRC = NULL; + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || + MI->getOpcode() == AMDGPU::PHI) { + const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { if (!MI->getOperand(i).isReg() || !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) @@ -774,13 +908,40 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) continue; unsigned DstReg = MRI.createVirtualRegister(RC); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + MachineBasicBlock *InsertBB; + MachineBasicBlock::iterator Insert; + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { + InsertBB = MI->getParent(); + Insert = MI; + } else { + // MI is a PHI instruction. 
+ InsertBB = MI->getOperand(i + 1).getMBB(); + Insert = InsertBB->getFirstTerminator(); + } + BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) .addOperand(MI->getOperand(i)); MI->getOperand(i).setReg(DstReg); } } + // Legalize INSERT_SUBREG + // src0 must have the same register class as dst + if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned Src0 = MI->getOperand(1).getReg(); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); + if (DstRC != Src0RC) { + MachineBasicBlock &MBB = *MI->getParent(); + unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0) + .addReg(Src0); + MI->getOperand(1).setReg(NewSrc0); + } + return; + } + // Legalize MUBUF* instructions // FIXME: If we start using the non-addr64 instructions for compute, we // may need to legalize them here. @@ -886,6 +1047,72 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { + MachineBasicBlock *MBB = MI->getParent(); + switch (MI->getOpcode()) { + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: + unsigned NewOpcode = getVALUOp(*MI); + unsigned RegOffset; + unsigned ImmOffset; + + if (MI->getOperand(2).isReg()) { + RegOffset = MI->getOperand(2).getReg(); + ImmOffset = 0; + } else { + assert(MI->getOperand(2).isImm()); + // SMRD instructions take a dword offsets and MUBUF instructions + // take a byte offset. + ImmOffset = MI->getOperand(2).getImm() << 2; + RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + if (isUInt<12>(ImmOffset)) { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(0); + } else { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(ImmOffset); + ImmOffset = 0; + } + } + + unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + unsigned DWord0 = RegOffset; + unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) + .addImm(0); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) + .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) + .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + MI->setDesc(get(NewOpcode)); + if (MI->getOperand(2).isReg()) { + MI->getOperand(2).setReg(MI->getOperand(1).getReg()); + } else { + MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); + } + MI->getOperand(1).setReg(SRsrc); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + } +} + void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { SmallVector<MachineInstr *, 128> Worklist; Worklist.push_back(&TopInst); @@ -895,8 +1122,16 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const 
{ MachineBasicBlock *MBB = Inst->getParent(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Opcode = Inst->getOpcode(); + unsigned NewOpcode = getVALUOp(*Inst); + // Handle some special cases - switch(Inst->getOpcode()) { + switch (Opcode) { + default: + if (isSMRD(Inst->getOpcode())) { + moveSMRDToVALU(Inst, MRI); + } + break; case AMDGPU::S_MOV_B64: { DebugLoc DL = Inst->getDebugLoc(); @@ -947,7 +1182,6 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { llvm_unreachable("Moving this op to VALU not implemented"); } - unsigned NewOpcode = getVALUOp(*Inst); if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to // legalize its operands instead. @@ -968,27 +1202,52 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst->RemoveOperand(i); } - // Add the implict and explicit register definitions. - if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); - } + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { + // We are converting these to a BFE, so we need to add the missing + // operands for the size and offset. + unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; + Inst->addOperand(Inst->getOperand(1)); + Inst->getOperand(1).ChangeToImmediate(0); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Size)); + + // XXX - Other pointless operands. There are 4, but it seems you only need + // 3 to not hit an assertion later in MCInstLower. + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); } - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } + addDescImplicitUseDef(NewDesc, Inst); + + if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = Inst->getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second operand + // back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + + Inst->RemoveOperand(2); // Remove old immediate. + Inst->addOperand(Inst->getOperand(1)); + Inst->getOperand(1).ChangeToImmediate(0); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Offset)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(BitWidth)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); } - legalizeOperands(Inst); - // Update the destination register class. 
+ const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - switch (Inst->getOpcode()) { + switch (Opcode) { // For target instructions, getOpRegClass just returns the virtual // register class associated with the operand, so we need to find an // equivalent VGPR register class in order to move the instruction to the @@ -996,6 +1255,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { case AMDGPU::COPY: case AMDGPU::PHI: case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: if (RI.hasVGPRs(NewDstRC)) continue; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); @@ -1010,6 +1270,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); + // Legalize the operands + legalizeOperands(Inst); + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), E = MRI.use_end(); I != E; ++I) { MachineInstr &UseMI = *I->getParent(); @@ -1097,6 +1360,24 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist, Worklist.push_back(HiHalf); } +void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, + MachineInstr *Inst) const { + // Add the implict and explicit register definitions. + if (NewDesc.ImplicitUses) { + for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { + unsigned Reg = NewDesc.ImplicitUses[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); + } + } + + if (NewDesc.ImplicitDefs) { + for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { + unsigned Reg = NewDesc.ImplicitDefs[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); + } + } +} + MachineInstrBuilder SIInstrInfo::buildIndirectWrite( MachineBasicBlock *MBB, MachineBasicBlock::iterator I, diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index c537038..7b31a81 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -47,49 +47,52 @@ private: void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist, MachineInstr *Inst, unsigned Opcode) const; + void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; public: explicit SIInstrInfo(AMDGPUTargetMachine &tm); - const SIRegisterInfo &getRegisterInfo() const { + const SIRegisterInfo &getRegisterInfo() const override { return RI; } - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI) const override; + + virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; unsigned commuteOpcode(unsigned Opcode) const; - virtual MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI=false) const; + MachineInstr *commuteInstruction(MachineInstr *MI, + bool NewMI=false) const override; bool isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA = 0) const; + AliasAnalysis *AA = 
nullptr) const; - virtual unsigned getIEQOpcode() const { + unsigned getIEQOpcode() const override { llvm_unreachable("Unimplemented"); } MachineInstr *buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const; - virtual bool isMov(unsigned Opcode) const; + unsigned DstReg, unsigned SrcReg) const override; + bool isMov(unsigned Opcode) const override; - virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; bool isDS(uint16_t Opcode) const; int isMIMG(uint16_t Opcode) const; int isSMRD(uint16_t Opcode) const; @@ -101,8 +104,8 @@ public: bool isInlineConstant(const MachineOperand &MO) const; bool isLiteralConstant(const MachineOperand &MO) const; - virtual bool verifyInstruction(const MachineInstr *MI, - StringRef &ErrInfo) const; + bool verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const override; bool isSALUInstr(const MachineInstr &MI) const; static unsigned getVALUOp(const MachineInstr &MI); @@ -136,32 +139,36 @@ public: /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr *MI) const; + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; + /// \brief Replace this instruction's opcode with the equivalent VALU /// opcode. This function will also move the users of \p MI to the /// VALU if necessary. void moveToVALU(MachineInstr &MI) const; - virtual unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const; + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; - virtual const TargetRegisterClass *getIndirectAddrRegClass() const; + const TargetRegisterClass *getIndirectAddrRegClass() const override; - virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, - unsigned OffsetReg) const; + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; - virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, - unsigned OffsetReg) const; + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; void reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const; void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, unsigned SavReg, unsigned IndexReg) const; + + void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; }; namespace AMDGPU { @@ -169,6 +176,7 @@ namespace AMDGPU { int getVOPe64(uint16_t Opcode); int getCommuteRev(uint16_t Opcode); int getCommuteOrig(uint16_t Opcode); + int getMCOpcode(uint16_t Opcode, unsigned Gen); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index e05ab65..2242e6d 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -7,23 +7,25 @@ // //===----------------------------------------------------------------------===// +// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum +// in AMDGPUMCInstLower.h +def SISubtarget { + int NONE = -1; + int SI = 0; +} + 
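This SISubtarget def, the SIMCInstr class added below, and the getMCOpcode() declaration just added to SIInstrInfo.h together set up a pseudo-to-MC opcode mapping keyed by subtarget generation. How a caller consumes the mapping is not part of this patch, so the wrapper below is only an illustrative sketch (it assumes the usual -1 convention for a missing table entry and requires SIInstrInfo.h):

    // Illustrative only: resolve an opcode to its subtarget-specific MC form.
    // Gen is the subtarget value (SI == 0, per the SISubtarget def above).
    static unsigned resolveMCOpcode(uint16_t Opcode, unsigned Gen) {
      int MCOpcode = AMDGPU::getMCOpcode(Opcode, Gen);
      return MCOpcode == -1 ? Opcode : unsigned(MCOpcode);
    }
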
//===----------------------------------------------------------------------===// // SI DAG Nodes //===----------------------------------------------------------------------===// -// SMRD takes a 64bit memory address and can only add an 32bit offset -def SIadd64bit32bit : SDNode<"ISD::ADD", - SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]> ->; - def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", - SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, i128>, SDTCisVT<2, i32>]>, + SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, [SDNPMayLoad, SDNPMemOperand] >; def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTypeProfile<0, 13, - [SDTCisVT<0, i128>, // rsrc(SGPR) + [SDTCisVT<0, v4i32>, // rsrc(SGPR) SDTCisVT<1, iAny>, // vdata(VGPR) SDTCisVT<2, i32>, // num_channels(imm) SDTCisVT<3, i32>, // vaddr(VGPR) @@ -41,13 +43,13 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", >; def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", - SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, i128>, SDTCisVT<2, i16>, + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>, SDTCisVT<3, i32>]> >; class SDSample<string opcode> : SDNode <opcode, SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>, - SDTCisVT<3, i128>, SDTCisVT<4, i32>]> + SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> >; def SIsample : SDSample<"AMDGPUISD::SAMPLE">; @@ -111,14 +113,17 @@ def IMM16bit : PatLeaf <(imm), [{return isUInt<16>(N->getZExtValue());}] >; +def IMM32bit : PatLeaf <(imm), + [{return isUInt<32>(N->getZExtValue());}] +>; + def mubuf_vaddr_offset : PatFrag< (ops node:$ptr, node:$offset, node:$imm_offset), (add (add node:$ptr, node:$offset), node:$imm_offset) >; class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ - return - (*(const SITargetLowering *)getTargetLowering()).analyzeImmediate(N) == 0; + return isInlineImmediate(N); }]>; class SGPRImm <dag frag> : PatLeaf<frag, [{ @@ -138,7 +143,7 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{ }]>; def FRAMEri32 : Operand<iPTR> { - let MIOperandInfo = (ops SReg_32:$ptr, i32imm:$index); + let MIOperandInfo = (ops i32:$ptr, i32imm:$index); } //===----------------------------------------------------------------------===// @@ -197,15 +202,17 @@ class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 < opName#" $dst, $src0, $src1", pattern >; -class SOPC_32 <bits<7> op, string opName, list<dag> pattern> : SOPC < - op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; -class SOPC_64 <bits<7> op, string opName, list<dag> pattern> : SOPC < - op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), - opName#" $dst, $src0, $src1", pattern ->; +class SOPC_Helper <bits<7> op, RegisterClass rc, ValueType vt, + string opName, PatLeaf cond> : SOPC < + op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), + opName#" $dst, $src0, $src1", []>; + +class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper<op, SSrc_32, i32, opName, cond>; + +class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper<op, SSrc_64, i64, opName, cond>; class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK < op, (outs SReg_32:$dst), (ins i16imm:$src0), @@ -221,7 +228,7 @@ multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass, RegisterClass dstClass> { def _IMM : SMRD < op, 1, (outs dstClass:$dst), - (ins baseClass:$sbase, i32imm:$offset), + (ins 
baseClass:$sbase, u32imm:$offset), asm#" $dst, $sbase, $offset", [] >; @@ -245,6 +252,28 @@ class VOP2_REV <string revOp, bit isOrig> { bit IsOrig = isOrig; } +class SIMCInstr <string pseudo, int subtarget> { + string PseudoInstr = pseudo; + int Subtarget = subtarget; +} + +multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern, + string opName> { + + def "" : InstSI <outs, ins, "", pattern>, VOP <opName>, + SIMCInstr<OpName, SISubtarget.NONE> { + let isPseudo = 1; + } + + def _si : VOP3 <op, outs, ins, asm, []>, SIMCInstr<opName, SISubtarget.SI>; + +} + +// This must always be right before the operand being input modified. +def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> { + let PrintMethod = "printOperandAndMods"; +} + multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src, string opName, list<dag> pattern> { @@ -256,10 +285,8 @@ multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src, def _e64 : VOP3 < {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs drc:$dst), - (ins src:$src0, - i32imm:$abs, i32imm:$clamp, - i32imm:$omod, i32imm:$neg), - opName#"_e64 $dst, $src0, $abs, $clamp, $omod, $neg", [] + (ins InputMods:$src0_modifiers, src:$src0, i32imm:$clamp, i32imm:$omod), + opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", [] >, VOP <opName> { let src1 = SIOperand.ZERO; let src2 = SIOperand.ZERO; @@ -288,10 +315,10 @@ multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc, def _e64 : VOP3 < {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs vrc:$dst), - (ins arc:$src0, arc:$src1, - i32imm:$abs, i32imm:$clamp, - i32imm:$omod, i32imm:$neg), - opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] + (ins InputMods:$src0_modifiers, arc:$src0, + InputMods:$src1_modifiers, arc:$src1, + i32imm:$clamp, i32imm:$omod), + opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [] >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { let src2 = SIOperand.ZERO; } @@ -316,10 +343,10 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern, def _e64 : VOP3b < {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs VReg_32:$dst), - (ins VSrc_32:$src0, VSrc_32:$src1, - i32imm:$abs, i32imm:$clamp, - i32imm:$omod, i32imm:$neg), - opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] + (ins InputMods: $src0_modifiers, VSrc_32:$src0, + InputMods:$src1_modifiers, VSrc_32:$src1, + i32imm:$clamp, i32imm:$omod), + opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [] >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { let src2 = SIOperand.ZERO; /* the VOP2 variant puts the carry out into VCC, the VOP3 variant @@ -340,15 +367,16 @@ multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, def _e64 : VOP3 < {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs SReg_64:$dst), - (ins arc:$src0, arc:$src1, - InstFlag:$abs, InstFlag:$clamp, - InstFlag:$omod, InstFlag:$neg), - opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", + (ins InputMods:$src0_modifiers, arc:$src0, + InputMods:$src1_modifiers, arc:$src1, + InstFlag:$clamp, InstFlag:$omod), + opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", !if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>, [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))] ) >, VOP <opName> { let src2 = SIOperand.ZERO; + let src2_modifiers = 0; } } @@ -360,12 +388,13 @@ multiclass VOPC_64 <bits<8> op, string opName, ValueType vt = 
untyped, PatLeaf cond = COND_NULL> : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>; -class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 < +multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m < op, (outs VReg_32:$dst), - (ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2, - InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), - opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern ->, VOP <opName>; + (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers, + VSrc_32:$src1, InputMods:$src2_modifiers, VSrc_32:$src2, + InstFlag:$clamp, InstFlag:$omod), + opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName +>; class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 < op, (outs VReg_64:$dst), @@ -374,10 +403,9 @@ class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 < >, VOP <opName> { let src2 = SIOperand.ZERO; - let abs = 0; + let src0_modifiers = 0; let clamp = 0; let omod = 0; - let neg = 0; } class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 < @@ -403,7 +431,7 @@ class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> : class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < op, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, i16imm:$offset), + (ins i1imm:$gds, VReg_32:$addr, u16imm:$offset), asm#" $vdst, $addr, $offset, [M0]", []> { let data0 = 0; @@ -415,7 +443,7 @@ class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS < op, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, i8imm:$offset0, i8imm:$offset1), + (ins i1imm:$gds, VReg_32:$addr, u8imm:$offset0, u8imm:$offset1), asm#" $gds, $vdst, $addr, $offset0, $offset1, [M0]", []> { let data0 = 0; @@ -427,7 +455,7 @@ class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS < class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i16imm:$offset), + (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u16imm:$offset), asm#" $addr, $data0, $offset [M0]", []> { let data1 = 0; @@ -439,7 +467,7 @@ class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i8imm:$offset0, i8imm:$offset1), + (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u8imm:$offset0, u8imm:$offset1), asm#" $addr, $data0, $data1, $offset0, $offset1 [M0]", []> { let mayStore = 1; @@ -450,7 +478,7 @@ class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A < op, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i16imm:$offset), + (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, u16imm:$offset), asm#" $vdst, $addr, $data0, $offset, [M0]", []> { @@ -462,7 +490,7 @@ class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A < class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < op, (outs), - (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, + (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, 
SSrc_32:$soffset), asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," @@ -481,7 +509,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> { let offen = 0, idxen = 0 in { def _OFFSET : MUBUF <op, (outs regClass:$vdata), (ins SReg_128:$srsrc, VReg_32:$vaddr, - i16imm:$offset, SSrc_32:$soffset, i1imm:$glc, + u16imm:$offset, SSrc_32:$soffset, i1imm:$glc, i1imm:$slc, i1imm:$tfe), asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; } @@ -497,7 +525,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> { let offen = 0, idxen = 1 in { def _IDXEN : MUBUF <op, (outs regClass:$vdata), (ins SReg_128:$srsrc, VReg_32:$vaddr, - i16imm:$offset, SSrc_32:$soffset, i1imm:$glc, + u16imm:$offset, SSrc_32:$soffset, i1imm:$glc, i1imm:$slc, i1imm:$tfe), asm#" $vdata, $srsrc[$vaddr] + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; } @@ -513,7 +541,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> { let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in { def _ADDR64 : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset), + (ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset), asm#" $vdata, $srsrc + $vaddr + $offset", []>; } } @@ -521,7 +549,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> { class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, - i16imm:$offset), + u16imm:$offset), name#" $vdata, $srsrc + $vaddr + $offset", []> { @@ -542,7 +570,7 @@ class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < op, (outs regClass:$dst), - (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," @@ -677,4 +705,12 @@ def isDS : InstrMapping { let ValueCols = [["8"]]; } +def getMCOpcode : InstrMapping { + let FilterClass = "SIMCInstr"; + let RowFields = ["PseudoInstr"]; + let ColFields = ["Subtarget"]; + let KeyCol = [!cast<string>(SISubtarget.NONE)]; + let ValueCols = [[!cast<string>(SISubtarget.SI)]]; +} + include "SIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 5232139..500fa78 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -32,9 +32,56 @@ def isSI : Predicate<"Subtarget.getGeneration() " def isCI : Predicate<"Subtarget.getGeneration() " ">= AMDGPUSubtarget::SEA_ISLANDS">; +def isCFDepth0 : Predicate<"isCFDepth0()">; + def WAIT_FLAG : InstFlag<"printWaitFlag">; -let Predicates = [isSI] in { +let SubtargetPredicate = isSI in { +let OtherPredicates = [isCFDepth0] in { + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +let mayLoad = 1 in { + +// We are using the SGPR_32 and not the SReg_32 register class for 32-bit +// SMRD instructions, because the SGPR_32 register class does not include M0 +// and writing to M0 from an SMRD instruction will hang the GPU. 
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>; + +defm S_BUFFER_LOAD_DWORD : SMRD_Helper < + 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32 +>; + +defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < + 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64 +>; + +defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < + 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128 +>; + +defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < + 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256 +>; + +defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < + 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512 +>; + +} // mayLoad = 1 + +//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; + +//===----------------------------------------------------------------------===// +// SOP1 Instructions +//===----------------------------------------------------------------------===// let neverHasSideEffects = 1 in { @@ -45,7 +92,10 @@ def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>; def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>; } // End isMoveImm = 1 -def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>; +def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", + [(set i32:$dst, (not i32:$src0))] +>; + def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>; def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>; def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>; @@ -65,8 +115,13 @@ def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; //def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>; def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>; //def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>; -//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>; -//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>; +def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", + [(set i32:$dst, (sext_inreg i32:$src0, i8))] +>; +def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", + [(set i32:$dst, (sext_inreg i32:$src0, i16))] +>; + ////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>; ////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>; ////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>; @@ -99,6 +154,150 @@ def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>; def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>; def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>; def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>; + +//===----------------------------------------------------------------------===// +// SOP2 Instructions +//===----------------------------------------------------------------------===// + +let Defs = [SCC] in { // Carry out goes to SCC +let isCommutable = 1 in { +def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; +def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", + [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] +>; +} // End isCommutable = 1 + +def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; +def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", + [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] +>; + +let Uses = [SCC] in { // Carry in comes from SCC +let isCommutable = 1 in 
{ +def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", + [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End isCommutable = 1 + +def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", + [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End Uses = [SCC] +} // End Defs = [SCC] + +def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", + [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))] +>; +def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", + [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))] +>; +def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", + [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))] +>; +def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", + [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))] +>; + +def S_CSELECT_B32 : SOP2 < + 0x0000000a, (outs SReg_32:$dst), + (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", + [] +>; + +def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; + +def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", + [(set i32:$dst, (and i32:$src0, i32:$src1))] +>; + +def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", + [(set i64:$dst, (and i64:$src0, i64:$src1))] +>; + +def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", + [(set i32:$dst, (or i32:$src0, i32:$src1))] +>; + +def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", + [(set i64:$dst, (or i64:$src0, i64:$src1))] +>; + +def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", + [(set i32:$dst, (xor i32:$src0, i32:$src1))] +>; + +def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", + [(set i64:$dst, (xor i64:$src0, i64:$src1))] +>; +def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; +def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>; +def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>; +def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>; +def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; +def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; +def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; +def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>; +def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; +def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; + +// Use added complexity so these patterns are preferred to the VALU patterns. 
+let AddedComplexity = 1 in { + +def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", + [(set i32:$dst, (shl i32:$src0, i32:$src1))] +>; +def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64", + [(set i64:$dst, (shl i64:$src0, i32:$src1))] +>; +def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", + [(set i32:$dst, (srl i32:$src0, i32:$src1))] +>; +def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64", + [(set i64:$dst, (srl i64:$src0, i32:$src1))] +>; +def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", + [(set i32:$dst, (sra i32:$src0, i32:$src1))] +>; +def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64", + [(set i64:$dst, (sra i64:$src0, i32:$src1))] +>; + +} // End AddedComplexity = 1 + +def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; +def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; +def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; +def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; +def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; +def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; +def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; +//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; +def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; + +//===----------------------------------------------------------------------===// +// SOPC Instructions +//===----------------------------------------------------------------------===// + +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32">; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32">; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32">; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32">; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32">; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32">; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32">; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32">; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32">; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32">; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32">; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32">; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; + +//===----------------------------------------------------------------------===// +// SOPK Instructions +//===----------------------------------------------------------------------===// + def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>; def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>; @@ -147,6 +346,108 @@ def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>; //def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>; //def EXP : EXP_ <0x00000000, "EXP", []>; +} // End let OtherPredicates = [isCFDepth0] + +//===----------------------------------------------------------------------===// +// SOPP Instructions +//===----------------------------------------------------------------------===// + +def S_NOP : SOPP <0x00000000, (ins i16imm:$SIMM16), "S_NOP $SIMM16", []>; + +let isTerminator = 1 in { + +def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", + [(IL_retflag)]> { + let SIMM16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; +} + +let isBranch = 1 in { +def S_BRANCH : SOPP < + 0x00000002, (ins brtarget:$target), 
"S_BRANCH $target", + [(br bb:$target)]> { + let isBarrier = 1; +} + +let DisableEncoding = "$scc" in { +def S_CBRANCH_SCC0 : SOPP < + 0x00000004, (ins brtarget:$target, SCCReg:$scc), + "S_CBRANCH_SCC0 $target", [] +>; +def S_CBRANCH_SCC1 : SOPP < + 0x00000005, (ins brtarget:$target, SCCReg:$scc), + "S_CBRANCH_SCC1 $target", + [] +>; +} // End DisableEncoding = "$scc" + +def S_CBRANCH_VCCZ : SOPP < + 0x00000006, (ins brtarget:$target, VCCReg:$vcc), + "S_CBRANCH_VCCZ $target", + [] +>; +def S_CBRANCH_VCCNZ : SOPP < + 0x00000007, (ins brtarget:$target, VCCReg:$vcc), + "S_CBRANCH_VCCNZ $target", + [] +>; + +let DisableEncoding = "$exec" in { +def S_CBRANCH_EXECZ : SOPP < + 0x00000008, (ins brtarget:$target, EXECReg:$exec), + "S_CBRANCH_EXECZ $target", + [] +>; +def S_CBRANCH_EXECNZ : SOPP < + 0x00000009, (ins brtarget:$target, EXECReg:$exec), + "S_CBRANCH_EXECNZ $target", + [] +>; +} // End DisableEncoding = "$exec" + + +} // End isBranch = 1 +} // End isTerminator = 1 + +let hasSideEffects = 1 in { +def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER", + [(int_AMDGPU_barrier_local)] +> { + let SIMM16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; + let mayLoad = 1; + let mayStore = 1; +} + +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16", + [] +>; +//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; +//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; +//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; + +let Uses = [EXEC] in { + def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16", + [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)] + > { + let DisableEncoding = "$m0"; + } +} // End Uses = [EXEC] + +//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; +//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; +//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; +//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; +//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>; +//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; +} // End hasSideEffects + +//===----------------------------------------------------------------------===// +// VOPC Instructions +//===----------------------------------------------------------------------===// + let isCompare = 1 in { defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">; @@ -403,6 +704,10 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; } // End isCompare = 1 +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// + def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>; def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>; def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>; @@ -427,6 +732,9 @@ def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "DS_READ2_B64", VReg_128>; // TODO: DS_READ2ST64_B32, DS_READ2ST64_B64, // DS_WRITE2ST64_B32, DS_WRITE2ST64_B64 +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// //def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>; //def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>; @@ -499,6 +807,11 @@ def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>; //def 
BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>; //def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>; + +//===----------------------------------------------------------------------===// +// MTBUF Instructions +//===----------------------------------------------------------------------===// + //def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>; //def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>; //def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>; @@ -508,41 +821,10 @@ def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FOR def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>; def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>; -let mayLoad = 1 in { - -// We are using the SGPR_32 and not the SReg_32 register class for 32-bit -// SMRD instructions, because the SGPR_32 register class does not include M0 -// and writing to M0 from an SMRD instruction will hang the GPU. -defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>; - -defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32 ->; - -defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64 ->; - -defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128 ->; - -defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256 ->; - -defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512 ->; - -} // mayLoad = 1 +//===----------------------------------------------------------------------===// +// MIMG Instructions +//===----------------------------------------------------------------------===// -//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">; defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">; //def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>; @@ -638,8 +920,12 @@ defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">; //def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>; //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>; -//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; let neverHasSideEffects = 1, isMoveImm = 1 in { defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>; @@ -691,8 +977,13 @@ defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32", //defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>; //defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>; //defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, 
"V_CVT_F32_UBYTE3", []>; -//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; -//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; +defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64", + [(set i32:$dst, (fp_to_uint f64:$src0))] +>; +defm V_CVT_F64_U32 : VOP1_64_32 <0x00000016, "V_CVT_F64_U32", + [(set f64:$dst, (uint_to_fp i32:$src0))] +>; + defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", [(set f32:$dst, (AMDGPUfract f32:$src0))] >; @@ -756,6 +1047,11 @@ defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>; defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>; defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>; + +//===----------------------------------------------------------------------===// +// VINTRP Instructions +//===----------------------------------------------------------------------===// + def V_INTERP_P1_F32 : VINTRP < 0x00000000, (outs VReg_32:$dst), @@ -786,97 +1082,9 @@ def V_INTERP_MOV_F32 : VINTRP < let DisableEncoding = "$m0"; } -//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>; - -let isTerminator = 1 in { - -def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", - [(IL_retflag)]> { - let SIMM16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; -} - -let isBranch = 1 in { -def S_BRANCH : SOPP < - 0x00000002, (ins brtarget:$target), "S_BRANCH $target", - [(br bb:$target)]> { - let isBarrier = 1; -} - -let DisableEncoding = "$scc" in { -def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC0 $target", [] ->; -def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC1 $target", - [] ->; -} // End DisableEncoding = "$scc" - -def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCZ $target", - [] ->; -def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCNZ $target", - [] ->; - -let DisableEncoding = "$exec" in { -def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECZ $target", - [] ->; -def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECNZ $target", - [] ->; -} // End DisableEncoding = "$exec" - - -} // End isBranch = 1 -} // End isTerminator = 1 - -let hasSideEffects = 1 in { -def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER", - [(int_AMDGPU_barrier_local)] -> { - let SIMM16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; - let mayLoad = 1; - let mayStore = 1; -} - -def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16", - [] ->; -//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; -//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; -//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; - -let Uses = [EXEC] in { - def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16", - [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)] - > { - let DisableEncoding = "$m0"; - } -} // End Uses = [EXEC] - -//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; -//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; -//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; -//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; -//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>; -//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; -} // End hasSideEffects +//===----------------------------------------------------------------------===// +// VOP2 Instructions 
+//===----------------------------------------------------------------------===// def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), @@ -891,18 +1099,11 @@ def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))] ->; - -//f32 pattern for V_CNDMASK_B32_e64 -def : Pat < - (f32 (select i1:$src2, f32:$src1, f32:$src0)), - (V_CNDMASK_B32_e64 $src0, $src1, $src2) ->; - -def : Pat < - (i32 (trunc i64:$val)), - (EXTRACT_SUBREG $val, sub0) ->; +> { + let src0_modifiers = 0; + let src1_modifiers = 0; + let src2_modifiers = 0; +} def V_READLANE_B32 : VOP2 < 0x00000001, @@ -946,11 +1147,11 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", - [(set i32:$dst, (mul I24:$src0, I24:$src1))] + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))] >; //defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", - [(set i32:$dst, (mul U24:$src0, U24:$src1))] + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))] >; //defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>; @@ -965,27 +1166,43 @@ defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; -defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>; -defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>; -defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>; -defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>; +defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", + [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]>; +defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", + [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]>; +defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", + [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]>; +defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", + [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]>; + +defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", + [(set i32:$dst, (srl i32:$src0, i32:$src1))] +>; -defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>; defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">; -defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>; +defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", + [(set i32:$dst, (sra i32:$src0, i32:$src1))] +>; defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">; let hasPostISelHook = 1 in { -defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>; +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", + [(set i32:$dst, (shl i32:$src0, i32:$src1))] +>; } defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">; -defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", []>; -defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", []>; -defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", []>; +defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", + [(set i32:$dst, (and i32:$src0, i32:$src1))]>; +defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", + [(set i32:$dst, (or i32:$src0, i32:$src1))] +>; +defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", + [(set i32:$dst, (xor i32:$src0, i32:$src1))] +>; } // End isCommutable = 1 @@ -1001,14 +1218,18 @@ defm V_MBCNT_HI_U32_B32 : VOP2_32 
<0x00000024, "V_MBCNT_HI_U32_B32", []>; let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. -defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", [], VSrc_32>; -defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", [], VSrc_32>; +defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", + [(set i32:$dst, (add i32:$src0, i32:$src1))], VSrc_32>; +defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", + [(set i32:$dst, (sub i32:$src0, i32:$src1))], VSrc_32>; defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], VSrc_32, "V_SUB_I32">; let Uses = [VCC] in { // Carry-in comes from VCC -defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", [], VReg_32>; -defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", [], VReg_32>; +defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", + [(set i32:$dst, (adde i32:$src0, i32:$src1))], VReg_32>; +defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", + [(set i32:$dst, (sube i32:$src0, i32:$src1))], VReg_32>; defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], VReg_32, "V_SUBB_U32">; } // End Uses = [VCC] @@ -1023,63 +1244,51 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", >; ////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; ////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; -def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>; -def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>; -def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>; -def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>; -def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>; -def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>; -def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>; -def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>; -def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>; -def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>; -def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>; -def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>; -////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>; -////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; -////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; -////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; -//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// let neverHasSideEffects = 1 in { -def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; -def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>; -def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", - [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))] +defm V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; +defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", + [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] +>; +defm V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))] >; -def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", - [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))] +defm V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", + [(set i32:$dst, 
(AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))] >; } // End neverHasSideEffects -def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; -def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; -def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>; -def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; + +defm V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; +defm V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; +defm V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>; +defm V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in { -def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", +defm V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>; -def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", +defm V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>; } -def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", +defm V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))]>; -defm : BFIPatterns <V_BFI_B32>; -def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", +defm V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))] >; def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] >; //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; -def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; -def : ROTRPattern <V_ALIGNBIT_B32>; +defm V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; -def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; -def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; +defm V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; +defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; ////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>; ////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>; ////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>; @@ -1092,9 +1301,9 @@ def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; //def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>; //def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>; //def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; -def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; +defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; -def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; +defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64", @@ -1116,181 +1325,46 @@ def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>; } // isCommutable = 1 -def : Pat < - (fadd f64:$src0, f64:$src1), - (V_ADD_F64 $src0, $src1, (i64 0)) ->; - -def : Pat < - (fmul f64:$src0, f64:$src1), - (V_MUL_F64 $src0, $src1, (i64 0)) ->; - def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; let isCommutable = 1 in { -def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; -def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; -def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; -def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; +defm V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; +defm V_MUL_HI_U32 : VOP3_32 <0x0000016a, 
"V_MUL_HI_U32", []>; +defm V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; +defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; } // isCommutable = 1 -def : Pat < - (mul i32:$src0, i32:$src1), - (V_MUL_LO_I32 $src0, $src1, (i32 0)) ->; - -def : Pat < - (mulhu i32:$src0, i32:$src1), - (V_MUL_HI_U32 $src0, $src1, (i32 0)) ->; - -def : Pat < - (mulhs i32:$src0, i32:$src1), - (V_MUL_HI_I32 $src0, $src1, (i32 0)) ->; - -def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; +defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; -def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; +defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; //def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; -let Defs = [SCC] in { // Carry out goes to SCC -let isCommutable = 1 in { -def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; -def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", - [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] ->; -} // End isCommutable = 1 - -def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; -def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", - [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] ->; - -let Uses = [SCC] in { // Carry in comes from SCC -let isCommutable = 1 in { -def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", - [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End isCommutable = 1 - -def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", - [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End Uses = [SCC] -} // End Defs = [SCC] - -def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", - [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))] ->; -def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", - [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))] ->; -def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", - [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))] ->; -def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", - [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))] ->; - -def S_CSELECT_B32 : SOP2 < - 0x0000000a, (outs SReg_32:$dst), - (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", - [] ->; - -def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; - -def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", - [(set i32:$dst, (and i32:$src0, i32:$src1))] ->; - -def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", - [(set i64:$dst, (and i64:$src0, i64:$src1))] ->; - -def : Pat < - (i1 (and i1:$src0, i1:$src1)), - (S_AND_B64 $src0, $src1) ->; - -def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", - [(set i32:$dst, (or i32:$src0, i32:$src1))] ->; - -def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", - [(set i64:$dst, (or i64:$src0, i64:$src1))] ->; - -def : Pat < - (i1 (or i1:$src0, i1:$src1)), - (S_OR_B64 $src0, $src1) ->; +//===----------------------------------------------------------------------===// +// Pseudo Instructions +//===----------------------------------------------------------------------===// -def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", - [(set i32:$dst, (xor i32:$src0, i32:$src1))] ->; +let isCodeGenOnly = 1, isPseudo = 1 in { -def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", - [(set i1:$dst, (xor i1:$src0, 
i1:$src1))] +def V_MOV_I1 : InstSI < + (outs VReg_1:$dst), + (ins i1imm:$src), + "", [(set i1:$dst, (imm:$src))] >; -def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; -def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>; -def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>; -def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>; -def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; -def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; -def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; -def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>; -def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; -def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; - -// Use added complexity so these patterns are preferred to the VALU patterns. -let AddedComplexity = 1 in { -def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", - [(set i32:$dst, (shl i32:$src0, i32:$src1))] ->; -def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64", - [(set i64:$dst, (shl i64:$src0, i32:$src1))] ->; -def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", - [(set i32:$dst, (srl i32:$src0, i32:$src1))] ->; -def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64", - [(set i64:$dst, (srl i64:$src0, i32:$src1))] ->; -def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", - [(set i32:$dst, (sra i32:$src0, i32:$src1))] ->; -def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64", - [(set i64:$dst, (sra i64:$src0, i32:$src1))] +def V_AND_I1 : InstSI < + (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", + [(set i1:$dst, (and i1:$src0, i1:$src1))] >; -} // End AddedComplexity = 1 - -def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; -def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; -def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; -def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; -def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; -def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; -def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; -//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; -def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; - -let isCodeGenOnly = 1, isPseudo = 1 in { - -def LOAD_CONST : AMDGPUShaderInst < - (outs GPRF32:$dst), - (ins i32imm:$src), - "LOAD_CONST $dst, $src", - [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))] +def V_OR_I1 : InstSI < + (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", + [(set i1:$dst, (or i1:$src0, i1:$src1))] >; // SI pseudo instructions. 
These are used by the CFG structurizer pass @@ -1301,19 +1375,19 @@ let mayLoad = 1, mayStore = 1, hasSideEffects = 1, let isBranch = 1, isTerminator = 1 in { -def SI_IF : InstSI < +def SI_IF: InstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), - "SI_IF $dst, $vcc, $target", + "", [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] >; def SI_ELSE : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), - "SI_ELSE $dst, $src, $target", - [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]> { - + "", + [(set i64:$dst, (int_SI_else i64:$src, bb:$target))] +> { let Constraints = "$src = $dst"; } @@ -1370,7 +1444,7 @@ let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { let UseNamedOperandTable = 1 in { -def SI_RegisterLoad : AMDGPUShaderInst < +def SI_RegisterLoad : InstSI < (outs VReg_32:$dst, SReg_64:$temp), (ins FRAMEri32:$addr, i32imm:$chan), "", [] @@ -1379,7 +1453,7 @@ def SI_RegisterLoad : AMDGPUShaderInst < let mayLoad = 1; } -class SIRegStore<dag outs> : AMDGPUShaderInst < +class SIRegStore<dag outs> : InstSI < outs, (ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan), "", [] @@ -1439,8 +1513,33 @@ def V_SUB_F64 : InstSI < } // end usesCustomInserter +multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { + + def _SAVE : InstSI < + (outs VReg_32:$dst), + (ins sgpr_class:$src, i32imm:$frame_idx), + "", [] + >; + + def _RESTORE : InstSI < + (outs sgpr_class:$dst), + (ins VReg_32:$src, i32imm:$frame_idx), + "", [] + >; + +} + +defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; +defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; +defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; +defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; + } // end IsCodeGenOnly, isPseudo +} // end SubtargetPredicate = SI + +let Predicates = [isSI] in { + def : Pat< (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0)) @@ -1453,7 +1552,7 @@ def : Pat < /* int_SI_vs_load_input */ def : Pat< - (SIload_input i128:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr), + (SIload_input v4i32:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr), (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0) >; @@ -1470,40 +1569,116 @@ def : Pat < (V_SUB_F64 $src0, $src1) >; +//===----------------------------------------------------------------------===// +// SMRD Patterns +//===----------------------------------------------------------------------===// + +multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { + + // 1. Offset as 8bit DWORD immediate + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), + (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) + >; + + // 2. Offset loaded in an 32bit SGPR + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), + (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + >; + + // 3. No offset at all + def : Pat < + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) + >; +} + +defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; +defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>; +defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; +defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; + +// 1. 
Offset as 8bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) +>; + +// 2. Offset loaded in an 32bit SGPR +def : Pat < + (SIload_constant v4i32:$sbase, imm:$offset), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) +>; + +//===----------------------------------------------------------------------===// +// SOP2 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i1 (xor i1:$src0, i1:$src1)), + (S_XOR_B64 $src0, $src1) +>; + +//===----------------------------------------------------------------------===// +// VOP2 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (or i64:$src0, i64:$src1), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub0), + (EXTRACT_SUBREG i64:$src1, sub0)), sub0), + (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub1), + (EXTRACT_SUBREG i64:$src1, sub1)), sub1) +>; + +class SextInReg <ValueType vt, int ShiftAmt> : Pat < + (sext_inreg i32:$src0, vt), + (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0)) +>; + +def : SextInReg <i8, 24>; +def : SextInReg <i16, 16>; + /********** ======================= **********/ /********** Image sampling patterns **********/ /********** ======================= **********/ /* SIsample for simple 1D texture lookup */ def : Pat < - (SIsample i32:$addr, v32i8:$rsrc, i128:$sampler, imm), + (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, imm), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_RECT), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_ARRAY), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleShadowPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleShadowArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW_ARRAY), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; @@ -1692,8 +1867,6 @@ def : BitConvert <i64, v2i32, VReg_64>; def : BitConvert <v4f32, v4i32, VReg_128>; def : BitConvert <v4i32, v4f32, VReg_128>; -def : BitConvert <v4i32, i128, VReg_128>; -def : BitConvert <i128, v4i32, VReg_128>; def : BitConvert <v8f32, v8i32, SReg_256>; def : BitConvert <v8i32, v8f32, SReg_256>; @@ -1711,10 +1884,18 @@ def : BitConvert <v16f32, v16i32, VReg_512>; /********** Src & Dst modifiers **********/ /********** =================== **********/ +def FCLAMP_SI : AMDGPUShaderInst < + (outs VReg_32:$dst), + (ins VSrc_32:$src0), + "FCLAMP_SI $dst, $src0", + [] +> { + let usesCustomInserter = 
1; +} + def : Pat < (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */), - 0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */) + (FCLAMP_SI f32:$src) >; /********** ================================ **********/ @@ -1733,14 +1914,32 @@ def : Pat < (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ >; +def FABS_SI : AMDGPUShaderInst < + (outs VReg_32:$dst), + (ins VSrc_32:$src0), + "FABS_SI $dst, $src0", + [] +> { + let usesCustomInserter = 1; +} + def : Pat < (fabs f32:$src), - (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) /* Clear sign bit */ + (FABS_SI f32:$src) >; +def FNEG_SI : AMDGPUShaderInst < + (outs VReg_32:$dst), + (ins VSrc_32:$src0), + "FNEG_SI $dst, $src0", + [] +> { + let usesCustomInserter = 1; +} + def : Pat < (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Toggle sign bit */ + (FNEG_SI f32:$src) >; /********** ================== **********/ @@ -1768,30 +1967,10 @@ def : Pat < >; def : Pat < - (i1 imm:$imm), - (S_MOV_B64 imm:$imm) ->; - -def : Pat < (i64 InlineImm<i64>:$imm), (S_MOV_B64 InlineImm<i64>:$imm) >; -// i64 immediates aren't supported in hardware, split it into two 32bit values -def : Pat < - (i64 imm:$imm), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_MOV_B32 (i32 (LO32 imm:$imm))), sub0), - (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1) ->; - -def : Pat < - (f64 fpimm:$imm), - (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (V_MOV_B32_e32 (f32 (LO32f fpimm:$imm))), sub0), - (V_MOV_B32_e32 (f32 (HI32f fpimm:$imm))), sub1) ->; - /********** ===================== **********/ /********** Interpolation Paterns **********/ /********** ===================== **********/ @@ -1875,21 +2054,9 @@ class Ext32Pat <SDNode ext> : Pat < def : Ext32Pat <zext>; def : Ext32Pat <anyext>; -// 1. Offset as 8bit DWORD immediate +// Offset in an 32Bit VGPR def : Pat < - (SIload_constant i128:$sbase, IMM8bitDWORD:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) ->; - -// 2. Offset loaded in an 32bit SGPR -def : Pat < - (SIload_constant i128:$sbase, imm:$offset), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) ->; - -// 3. 
Offset in an 32Bit VGPR -def : Pat < - (SIload_constant i128:$sbase, i32:$voff), + (SIload_constant v4i32:$sbase, i32:$voff), (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0) >; @@ -1904,18 +2071,44 @@ def : Pat < def : Pat < (int_SI_tid), (V_MBCNT_HI_U32_B32_e32 0xffffffff, - (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0, 0, 0)) + (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0)) >; -/********** ================== **********/ -/********** VOP3 Patterns **********/ -/********** ================== **********/ +//===----------------------------------------------------------------------===// +// VOP3 Patterns +//===----------------------------------------------------------------------===// + +def : IMad24Pat<V_MAD_I32_I24>; +def : UMad24Pat<V_MAD_U32_U24>; def : Pat < - (f32 (fadd (fmul f32:$src0, f32:$src1), f32:$src2)), - (V_MAD_F32 $src0, $src1, $src2) + (fadd f64:$src0, f64:$src1), + (V_ADD_F64 $src0, $src1, (i64 0)) +>; + +def : Pat < + (fmul f64:$src0, f64:$src1), + (V_MUL_F64 $src0, $src1, (i64 0)) +>; + +def : Pat < + (mul i32:$src0, i32:$src1), + (V_MUL_LO_I32 $src0, $src1, (i32 0)) +>; + +def : Pat < + (mulhu i32:$src0, i32:$src1), + (V_MUL_HI_U32 $src0, $src1, (i32 0)) +>; + +def : Pat < + (mulhs i32:$src0, i32:$src1), + (V_MUL_HI_I32 $src0, $src1, (i32 0)) >; +defm : BFIPatterns <V_BFI_B32>; +def : ROTRPattern <V_ALIGNBIT_B32>; + /********** ======================= **********/ /********** Load/Store Patterns **********/ /********** ======================= **********/ @@ -1962,41 +2155,6 @@ def : Pat <(atomic_load_add_local i32:$ptr, i32:$val), def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val), (DS_SUB_U32_RTN 0, $ptr, $val, 0)>; -/********** ================== **********/ -/********** SMRD Patterns **********/ -/********** ================== **********/ - -multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { - - // 1. Offset as 8bit DWORD immediate - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), - (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) - >; - - // 2. Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (SIadd64bit32bit i64:$sbase, imm:$offset)), - (vt (Instr_SGPR $sbase, (S_MOV_B32 imm:$offset))) - >; - - // 3. 
No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; -defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>; -defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, i128>; -defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; -defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; - //===----------------------------------------------------------------------===// // MUBUF Patterns //===----------------------------------------------------------------------===// @@ -2083,7 +2241,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe MUBUF bothen> { def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 0, 0, imm:$glc, imm:$slc, imm:$tfe)), (offset $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc), @@ -2091,7 +2249,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe >; def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm, 1, 0, imm:$glc, imm:$slc, imm:$tfe)), (offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc), @@ -2099,7 +2257,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe >; def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 0, 1, imm:$glc, imm:$slc, imm:$tfe)), (idxen $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc), @@ -2107,7 +2265,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe >; def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, v2i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, imm, 1, 1, imm:$glc, imm:$slc, imm:$tfe)), (bothen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc), @@ -2128,7 +2286,7 @@ defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_ // TBUFFER_STORE_FORMAT_*, addr64=0 class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat< - (SItbuffer_store i128:$rsrc, vt:$vdata, num_channels, i32:$vaddr, + (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, i32:$soffset, imm:$inst_offset, imm:$dfmt, imm:$nfmt, imm:$offen, imm:$idxen, imm:$glc, imm:$slc, imm:$tfe), @@ -2156,12 +2314,13 @@ defm V_CEIL_F64 : VOP1_64 <0x00000018, "V_CEIL_F64", defm V_FLOOR_F64 : VOP1_64 <0x0000001A, "V_FLOOR_F64", [(set f64:$dst, (ffloor f64:$src0))] >; +defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64", + [(set f64:$dst, (frint f64:$src0))] +>; -defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64", []>; - -def V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>; -def V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>; -def V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>; +defm V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>; +defm V_MQSAD_U16_U8 : VOP3_32 <0x000000172, 
"V_MQSAD_U16_U8", []>; +defm V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>; def V_MAD_U64_U32 : VOP3_64 <0x00000176, "V_MAD_U64_U32", []>; // XXX - Does this set VCC? @@ -2248,17 +2407,43 @@ def : Pat< >; //===----------------------------------------------------------------------===// -// Miscellaneous Patterns +// Conversion Patterns //===----------------------------------------------------------------------===// +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 + +// TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it +// might not be worth the effort, and will need to expand to shifts when +// fixing SGPR copies. + +// Handle sext_inreg in i64 def : Pat < - (i64 (trunc i128:$x)), + (i64 (sext_inreg i64:$src, i1)), (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (i32 (EXTRACT_SUBREG $x, sub0)), sub0), - (i32 (EXTRACT_SUBREG $x, sub1)), sub1) + (S_BFE_I32 (EXTRACT_SUBREG i64:$src, sub0), 65536), sub0), // 0 | 1 << 16 + (S_MOV_B32 -1), sub1) >; def : Pat < + (i64 (sext_inreg i64:$src, i8)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_SEXT_I32_I8 (EXTRACT_SUBREG i64:$src, sub0)), sub0), + (S_MOV_B32 -1), sub1) +>; + +def : Pat < + (i64 (sext_inreg i64:$src, i16)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_SEXT_I32_I16 (EXTRACT_SUBREG i64:$src, sub0)), sub0), + (S_MOV_B32 -1), sub1) +>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Patterns +//===----------------------------------------------------------------------===// + +def : Pat < (i32 (trunc i64:$a)), (EXTRACT_SUBREG $a, sub0) >; diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index c2f8696..6601f2a 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -67,7 +67,7 @@ private: static const unsigned SkipThreshold = 12; static char ID; - const TargetRegisterInfo *TRI; + const SIRegisterInfo *TRI; const SIInstrInfo *TII; bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); @@ -92,11 +92,11 @@ private: public: SILowerControlFlowPass(TargetMachine &tm) : - MachineFunctionPass(ID), TRI(0), TII(0) { } + MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "SI Lower control flow instructions"; } @@ -427,7 +427,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo()); - TRI = MF.getTarget().getRegisterInfo(); + TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo()); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); bool HaveKill = false; diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp new file mode 100644 index 0000000..738c90b --- /dev/null +++ b/lib/Target/R600/SILowerI1Copies.cpp @@ -0,0 +1,148 @@ +//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// i1 values are usually inserted by the CFG Structurize pass and they are +/// unique in that they can be copied from VALU to SALU registers. +/// This is not possible for any other value type. Since there are no +/// MOV instructions for i1, we need to use V_CMP_* and V_CNDMASK to move the i1. +/// +//===----------------------------------------------------------------------===// +// + +#define DEBUG_TYPE "si-i1-copies" +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +namespace { + +class SILowerI1Copies : public MachineFunctionPass { +public: + static char ID; + +public: + SILowerI1Copies() : MachineFunctionPass(ID) { + initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnMachineFunction(MachineFunction &MF) override; + + virtual const char *getPassName() const override { + return "SI Lower i1 Copies"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) + +char SILowerI1Copies::ID = 0; + +char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID; + +FunctionPass *llvm::createSILowerI1CopiesPass() { + return new SILowerI1Copies(); +} + +bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + MF.getTarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + std::vector<unsigned> I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (MI.getOpcode() == AMDGPU::V_MOV_I1) { + I1Defs.push_back(MI.getOperand(0).getReg()); + MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32)); + continue; + } + + if (MI.getOpcode() == AMDGPU::V_AND_I1) { + I1Defs.push_back(MI.getOperand(0).getReg()); + MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32)); + continue; + } + + if (MI.getOpcode() == AMDGPU::V_OR_I1) { + I1Defs.push_back(MI.getOperand(0).getReg()); + MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32)); + continue; + } + + if (MI.getOpcode() != AMDGPU::COPY || + !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) || + !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg())) + continue; + + + const TargetRegisterClass *DstRC = + MRI.getRegClass(MI.getOperand(0).getReg()); + const TargetRegisterClass *SrcRC = + MRI.getRegClass(MI.getOperand(1).getReg()); + + if (DstRC == &AMDGPU::VReg_1RegClass && + TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { + I1Defs.push_back(MI.getOperand(0).getReg()); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64)) + .addOperand(MI.getOperand(0)) +
.addImm(0) + .addImm(-1) + .addOperand(MI.getOperand(1)) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0); + MI.eraseFromParent(); + } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && + SrcRC == &AMDGPU::VReg_1RegClass) { + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) + .addOperand(MI.getOperand(0)) + .addImm(0) + .addOperand(MI.getOperand(1)) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0); + MI.eraseFromParent(); + } + } + } + + for (unsigned Reg : I1Defs) + MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass); + + return false; +} diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp index ea04346..af60995 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -10,8 +10,11 @@ #include "SIMachineFunctionInfo.h" +#include "SIInstrInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" #define MAX_LANES 64 @@ -26,21 +29,57 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) PSInputAddr(0), SpillTracker() { } -static unsigned createLaneVGPR(MachineRegisterInfo &MRI) { - return MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); +static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) { + unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + + // We need to add this register as live out for the function, in order to + // have the live range calculated directly. + // + // When register spilling begins, we have already calculated the live + // intervals for all the registers. Since we are spilling SGPRs to + // VGPRs, we need to update the Lane VGPR's live interval every time we + // spill or restore a register. + // + // Unfortunately, there is no good way to update the live interval as + // the TargetInstrInfo callbacks for spilling and restoring don't give + // us access to the live interval information. + // + // We are lucky, though, because the InlineSpiller calls + // LiveRangeEdit::calculateRegClassAndHint() which iterates through + // all the new registers that have been created when restoring a register + // and calls LiveIntervals::getInterval(), which creates and computes + // the live interval for the newly created register. However, once this + // live interval is created, it doesn't change and since we usually reuse + // the Lane VGPR multiple times, this means any uses after the first aren't + // added to the live interval. + // + // To work around this, we add Lane VGPRs to the function's live-out list, + // so that we can guarantee their live ranges will cover all of their uses.
+ + for (MachineBasicBlock &MBB : *MF) { + if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) { + MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true)); + return VGPR; + } + } + MF->getFunction()->getContext().emitError( + "Could not find S_ENDPGM instruction."); + return VGPR; } -unsigned SIMachineFunctionInfo::RegSpillTracker::getNextLane(MachineRegisterInfo &MRI) { +unsigned SIMachineFunctionInfo::RegSpillTracker::reserveLanes( + MachineRegisterInfo &MRI, MachineFunction *MF, unsigned NumRegs) { + unsigned StartLane = CurrentLane; + CurrentLane += NumRegs; if (!LaneVGPR) { - LaneVGPR = createLaneVGPR(MRI); + LaneVGPR = createLaneVGPR(MRI, MF); } else { - CurrentLane++; - if (CurrentLane == MAX_LANES) { - CurrentLane = 0; - LaneVGPR = createLaneVGPR(MRI); + if (CurrentLane >= MAX_LANES) { + StartLane = CurrentLane = 0; + LaneVGPR = createLaneVGPR(MRI, MF); } } - return CurrentLane; + return StartLane; } void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex, diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h index 8dc82a0..96e619b 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -25,7 +25,7 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { - virtual void anchor(); + void anchor() override; public: struct SpilledReg { @@ -43,7 +43,12 @@ public: public: unsigned LaneVGPR; RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { } - unsigned getNextLane(MachineRegisterInfo &MRI); + /// \p NumRegs The number of consecutive registers that need to be spilled. + /// This function will ensure that all registers are stored in + /// the same VGPR. + /// \returns The lane to be used for storing the first register.
+ unsigned reserveLanes(MachineRegisterInfo &MRI, MachineFunction *MF, + unsigned NumRegs = 1); void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1); const SpilledReg& getSpilledReg(unsigned FrameIndex); bool programSpillsRegisters() { return !SpilledRegisters.empty(); } diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index 6cef195..c72d549 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -71,13 +71,12 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { &AMDGPU::SReg_256RegClass }; - for (unsigned i = 0, e = sizeof(BaseClasses) / - sizeof(const TargetRegisterClass*); i != e; ++i) { - if (BaseClasses[i]->contains(Reg)) { - return BaseClasses[i]; + for (const TargetRegisterClass *BaseClass : BaseClasses) { + if (BaseClass->contains(Reg)) { + return BaseClass; } } - return NULL; + return nullptr; } bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const { @@ -113,7 +112,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { return &AMDGPU::VReg_512RegClass; } - return NULL; + return nullptr; } const TargetRegisterClass *SIRegisterInfo::getSubRegClass( @@ -129,3 +128,10 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return &AMDGPU::VGPR_32RegClass; } } + +unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, + const TargetRegisterClass *SubRC, + unsigned Channel) const { + unsigned Index = getHWRegIndex(Reg); + return SubRC->getRegister(Index + Channel); +} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index 8148f7f..36b4fcd 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -27,22 +27,22 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { SIRegisterInfo(AMDGPUTargetMachine &tm); - virtual BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; - virtual unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; /// \param RC is an AMDIL reg class. /// /// \returns the SI register class that is equivalent to \p RC. - virtual const TargetRegisterClass * - getISARegClass(const TargetRegisterClass *RC) const; + const TargetRegisterClass * + getISARegClass(const TargetRegisterClass *RC) const override; /// \brief get the register class of the specified type to use in the /// CFGStructurizer - virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const; + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - virtual unsigned getHWRegIndex(unsigned Reg) const; + unsigned getHWRegIndex(unsigned Reg) const override; /// \brief Return the 'base' register class for this register. /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc. @@ -63,6 +63,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { /// be returned. const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, unsigned SubIdx) const; + + /// \p Channel This is the register channel (e.g. a value from 0-16), not the + /// SubReg index. + /// \returns The sub-register of Reg that is in Channel. 
+ unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, + unsigned Channel) const; }; } // End namespace llvm diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index 65cf311..f1f01de 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -168,7 +168,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64, (add SGPR_64Regs, VCCReg, EXECReg) >; -def SReg_128 : RegisterClass<"AMDGPU", [i128, v4i32], 128, (add SGPR_128)>; +def SReg_128 : RegisterClass<"AMDGPU", [v4i32], 128, (add SGPR_128)>; def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; @@ -183,14 +183,16 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { let Size = 96; } -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, i128], 128, (add VGPR_128)>; +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>; def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>; + //===----------------------------------------------------------------------===// -// [SV]Src_* register classes, can have either an immediate or an register +// [SV]Src_(32|64) register classes, can have either an immediate or an register //===----------------------------------------------------------------------===// def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>; @@ -201,3 +203,9 @@ def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>; def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; +//===----------------------------------------------------------------------===// +// SGPR and VGPR register classes +//===----------------------------------------------------------------------===// + +def VSrc_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, + (add VReg_128, SReg_128)>; diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp index 9bf2caf..a0b6907 100644 --- a/lib/Target/R600/SITypeRewriter.cpp +++ b/lib/Target/R600/SITypeRewriter.cpp @@ -35,13 +35,13 @@ class SITypeRewriter : public FunctionPass, static char ID; Module *Mod; Type *v16i8; - Type *i128; + Type *v4i32; public: SITypeRewriter() : FunctionPass(ID) { } - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - virtual const char *getPassName() const { + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "SI Type Rewriter"; } void visitLoadInst(LoadInst &I); @@ -56,7 +56,7 @@ char SITypeRewriter::ID = 0; bool SITypeRewriter::doInitialization(Module &M) { Mod = &M; v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); - i128 = Type::getIntNTy(M.getContext(), 128); + v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); return false; } @@ -84,7 +84,8 @@ void SITypeRewriter::visitLoadInst(LoadInst &I) { Type *ElemTy = PtrTy->getPointerElementType(); IRBuilder<> Builder(&I); if (ElemTy == v16i8) { - Value *BitCast = Builder.CreateBitCast(Ptr, Type::getIntNPtrTy(I.getContext(), 128, 2)); + Value *BitCast = Builder.CreateBitCast(Ptr, + PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); LoadInst *Load = Builder.CreateLoad(BitCast); SmallVector <std::pair<unsigned, MDNode*>, 8> MD; I.getAllMetadataOtherThanDebugLoc(MD); @@ -99,6 +100,7 @@ void 
SITypeRewriter::visitLoadInst(LoadInst &I) { void SITypeRewriter::visitCallInst(CallInst &I) { IRBuilder<> Builder(&I); + SmallVector <Value*, 8> Args; SmallVector <Type*, 8> Types; bool NeedToReplace = false; @@ -107,10 +109,10 @@ void SITypeRewriter::visitCallInst(CallInst &I) { for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { Value *Arg = I.getArgOperand(i); if (Arg->getType() == v16i8) { - Args.push_back(Builder.CreateBitCast(Arg, i128)); - Types.push_back(i128); + Args.push_back(Builder.CreateBitCast(Arg, v4i32)); + Types.push_back(v4i32); NeedToReplace = true; - Name = Name + ".i128"; + Name = Name + ".v4i32"; } else if (Arg->getType()->isVectorTy() && Arg->getType()->getVectorNumElements() == 1 && Arg->getType()->getVectorElementType() == @@ -144,12 +146,12 @@ void SITypeRewriter::visitCallInst(CallInst &I) { void SITypeRewriter::visitBitCast(BitCastInst &I) { IRBuilder<> Builder(&I); - if (I.getDestTy() != i128) { + if (I.getDestTy() != v4i32) { return; } if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) { - if (Op->getSrcTy() == i128) { + if (Op->getSrcTy() == v4i32) { I.replaceAllUsesWith(Op->getOperand(0)); I.eraseFromParent(); } |
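The FABS_SI, FNEG_SI and FCLAMP_SI pseudos introduced in the SIInstructions.td hunks above set usesCustomInserter, but the corresponding EmitInstrWithCustomInserter changes are not part of the hunks shown here. Judging from the bitwise patterns the patch removes, the expansion presumably looks roughly like the sketch below; the helper name and the exact BuildMI sequence are assumptions, not code from this commit.

static MachineBasicBlock *expandFAbsSI(MachineInstr *MI, MachineBasicBlock *BB,
                                       const SIInstrInfo *TII) {
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  DebugLoc DL = MI->getDebugLoc();

  // Clear the IEEE-754 sign bit, exactly what the removed
  // (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) pattern did.
  // FNEG_SI would instead toggle the bit with V_XOR_B32_e32 and 0x80000000,
  // and FCLAMP_SI maps back onto V_ADD_F32_e64 with the clamp bit set.
  unsigned Mask = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), Mask)
      .addImm(0x7fffffff);
  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_AND_B32_e32),
          MI->getOperand(0).getReg())
      .addReg(MI->getOperand(1).getReg())
      .addReg(Mask);
  MI->eraseFromParent();
  return BB;
}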
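The sext_inreg conversion patterns added above feed S_BFE_I32 a packed bitfield descriptor as its second operand: the bit offset sits in the low bits and the field width starts at bit 16, which is where the otherwise opaque 65536 comes from. A small compile-time check of that encoding, with a helper name invented purely for illustration:

// Pack an S_BFE_* operand: bit offset in the low bits, field width at bit 16.
constexpr unsigned bfeDescriptor(unsigned Offset, unsigned Width) {
  return Offset | (Width << 16);
}

static_assert(bfeDescriptor(0, 1) == 65536, "i1: 0 | 1 << 16, as used above");
static_assert(bfeDescriptor(0, 8) == 0x80000, "i8 would be 0 | 8 << 16");
static_assert(bfeDescriptor(0, 16) == 0x100000, "i16 would be 0 | 16 << 16");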
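The two new register helpers, SIRegisterInfo::getPhysRegSubReg() and RegSpillTracker::reserveLanes(), are easiest to read with a usage sketch. Everything below is illustrative only: the concrete register names, the SpillTracker member access and the surrounding function are assumptions based on this diff, not verified against the rest of the tree.

static void spillSketch(MachineFunction &MF, const SIRegisterInfo *TRI) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // getPhysRegSubReg: pick one 32-bit channel out of a wider physical register.
  // With Reg = SGPR4_SGPR5 (hardware index 4), SubRC = SGPR_32 and Channel = 1,
  // the expected result is SGPR5, i.e. the register at index 4 + 1.
  unsigned Half = TRI->getPhysRegSubReg(AMDGPU::SGPR4_SGPR5,
                                        &AMDGPU::SGPR_32RegClass, 1);
  (void)Half;

  // reserveLanes: reserve two consecutive lanes of a single lane VGPR so that
  // both 32-bit halves of a 64-bit SGPR spill land in the same VGPR. The
  // return value is the lane used for the first half; the second half goes
  // to StartLane + 1.
  unsigned StartLane = MFI->SpillTracker.reserveLanes(MRI, &MF, /*NumRegs=*/2);
  (void)StartLane;
}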