Diffstat (limited to 'lib/Target')
319 files changed, 28184 insertions, 4263 deletions
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index bbca228..6ae287a 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -493,11 +493,21 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
       return false;
     }
 
-    // These modifiers are not yet supported.
-    case 'p': // The high single-precision register of a VFP double-precision
-              // register.
     case 'e': // The low doubleword register of a NEON quad register.
-    case 'f': // The high doubleword register of a NEON quad register.
+    case 'f': { // The high doubleword register of a NEON quad register.
+      if (!MI->getOperand(OpNum).isReg())
+        return true;
+      unsigned Reg = MI->getOperand(OpNum).getReg();
+      if (!ARM::QPRRegClass.contains(Reg))
+        return true;
+      const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+      unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ?
+                                       ARM::dsub_0 : ARM::dsub_1);
+      O << ARMInstPrinter::getRegisterName(SubReg);
+      return false;
+    }
+
+    // These modifiers are not yet supported.
     case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1.
     case 'H': // The highest-numbered register of a pair.
       return true;
@@ -739,14 +749,14 @@ void ARMAsmPrinter::emitAttributes() {
   }
 
   // Signal various FP modes.
-  if (!UnsafeFPMath) {
+  if (!TM.Options.UnsafeFPMath) {
     AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_denormal,
                                ARMBuildAttrs::Allowed);
     AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
                                ARMBuildAttrs::Allowed);
   }
 
-  if (NoInfsFPMath && NoNaNsFPMath)
+  if (TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath)
     AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model,
                                ARMBuildAttrs::Allowed);
   else
@@ -759,7 +769,7 @@ void ARMAsmPrinter::emitAttributes() {
   AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1);
 
   // Hard float. Use both S and D registers and conform to AAPCS-VFP.
-  if (Subtarget->isAAPCS_ABI() && FloatABIType == FloatABI::Hard) {
+  if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard) {
     AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_HardFP_use, 3);
     AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_VFP_args, 1);
   }
@@ -1069,7 +1079,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
   }
 
   // Try to figure out the unwinding opcode out of src / dst regs.
-  if (MI->getDesc().mayStore()) {
+  if (MI->mayStore()) {
     // Register saves.
     assert(DstReg == ARM::SP &&
            "Only stack pointer as a destination reg is supported");
@@ -1481,11 +1491,10 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     /// in the function. The first operand is the ID# for this instruction, the
     /// second is the index into the MachineConstantPool that this is, the third
     /// is the size in bytes of this constant pool entry.
+    /// The required alignment is specified on the basic block holding this MI.
     unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
     unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
 
-    EmitAlignment(2);
-
     // Mark the constant pool entry as data if we're not already in a data
     // region.
OutStreamer.EmitDataRegion(); @@ -1934,4 +1943,3 @@ extern "C" void LLVMInitializeARMAsmPrinter() { RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget); RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget); } - diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 9315348..8bf5475 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -146,7 +146,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned AddrMode = (TSFlags & ARMII::AddrModeMask); const MCInstrDesc &MCID = MI->getDesc(); unsigned NumOps = MCID.getNumOperands(); - bool isLoad = !MCID.mayStore(); + bool isLoad = !MI->mayStore(); const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0); const MachineOperand &Base = MI->getOperand(2); const MachineOperand &Offset = MI->getOperand(NumOps-3); @@ -439,6 +439,22 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { return false; } +bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const { + if (MI->isBundle()) { + MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); + while (++I != E && I->isInsideBundle()) { + int PIdx = I->findFirstPredOperandIdx(); + if (PIdx != -1 && I->getOperand(PIdx).getImm() != ARMCC::AL) + return true; + } + return false; + } + + int PIdx = MI->findFirstPredOperandIdx(); + return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL; +} + bool ARMBaseInstrInfo:: PredicateInstruction(MachineInstr *MI, const SmallVectorImpl<MachineOperand> &Pred) const { @@ -491,7 +507,7 @@ bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const { // FIXME: This confuses implicit_def with optional CPSR def. const MCInstrDesc &MCID = MI->getDesc(); - if (!MCID.getImplicitDefs() && !MCID.hasOptionalDef()) + if (!MCID.getImplicitDefs() && !MI->hasOptionalDef()) return false; bool Found = false; @@ -510,11 +526,10 @@ bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI, /// By default, this returns true for every instruction with a /// PredicateOperand. bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const { - const MCInstrDesc &MCID = MI->getDesc(); - if (!MCID.isPredicable()) + if (!MI->isPredicable()) return false; - if ((MCID.TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) { + if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) { ARMFunctionInfo *AFI = MI->getParent()->getParent()->getInfo<ARMFunctionInfo>(); return AFI->isThumb2Function(); @@ -548,7 +563,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); if (MI->isLabel()) return 0; - unsigned Opc = MI->getOpcode(); + unsigned Opc = MI->getOpcode(); switch (Opc) { case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: @@ -556,6 +571,8 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { case TargetOpcode::EH_LABEL: case TargetOpcode::DBG_VALUE: return 0; + case TargetOpcode::BUNDLE: + return getInstBundleLength(MI); case ARM::MOVi16_ga_pcrel: case ARM::MOVTi16_ga_pcrel: case ARM::t2MOVi16_ga_pcrel: @@ -593,7 +610,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { ? 1 : ((Opc == ARM::t2TBH_JT) ? 2 : 4); unsigned NumOps = MCID.getNumOperands(); MachineOperand JTOP = - MI->getOperand(NumOps - (MCID.isPredicable() ? 3 : 2)); + MI->getOperand(NumOps - (MI->isPredicable() ? 
3 : 2)); unsigned JTI = JTOP.getIndex(); const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); assert(MJTI != 0); @@ -622,6 +639,17 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { return 0; // Not reached } +unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr *MI) const { + unsigned Size = 0; + MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); + while (++I != E && I->isInsideBundle()) { + assert(!I->isBundle() && "No nested bundle!"); + Size += GetInstSizeInBytes(&*I); + } + return Size; +} + void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, @@ -845,7 +873,7 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI, int &FrameIndex) const { const MachineMemOperand *Dummy; - return MI->getDesc().mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex); + return MI->mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex); } void ARMBaseInstrInfo:: @@ -991,7 +1019,7 @@ ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, int &FrameIndex) const { const MachineMemOperand *Dummy; - return MI->getDesc().mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex); + return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex); } bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const{ @@ -1357,7 +1385,7 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, return false; // Terminators and labels can't be scheduled around. - if (MI->getDesc().isTerminator() || MI->isLabel()) + if (MI->isTerminator() || MI->isLabel()) return true; // Treat the start of the IT block as a scheduling boundary, but schedule @@ -1762,8 +1790,7 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, // Check that CPSR isn't set between the comparison instruction and the one we // want to change. - MachineBasicBlock::const_iterator I = CmpInstr, E = MI, - B = MI->getParent()->begin(); + MachineBasicBlock::iterator I = CmpInstr,E = MI, B = MI->getParent()->begin(); // Early exit if CmpInstr is at the beginning of the BB. 
if (I == B) return false; @@ -1957,7 +1984,7 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI, bool isKill = UseMI->getOperand(OpIdx).isKill(); unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); AddDefaultCC(AddDefaultPred(BuildMI(*UseMI->getParent(), - *UseMI, UseMI->getDebugLoc(), + UseMI, UseMI->getDebugLoc(), get(NewUseOpc), NewReg) .addReg(Reg1, getKillRegState(isKill)) .addImm(SOImmValV1))); @@ -2332,6 +2359,59 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return UseCycle; } +static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI, + const MachineInstr *MI, unsigned Reg, + unsigned &DefIdx, unsigned &Dist) { + Dist = 0; + + MachineBasicBlock::const_iterator I = MI; ++I; + MachineBasicBlock::const_instr_iterator II = + llvm::prior(I.getInstrIterator()); + assert(II->isInsideBundle() && "Empty bundle?"); + + int Idx = -1; + while (II->isInsideBundle()) { + Idx = II->findRegisterDefOperandIdx(Reg, false, true, TRI); + if (Idx != -1) + break; + --II; + ++Dist; + } + + assert(Idx != -1 && "Cannot find bundled definition!"); + DefIdx = Idx; + return II; +} + +static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, + const MachineInstr *MI, unsigned Reg, + unsigned &UseIdx, unsigned &Dist) { + Dist = 0; + + MachineBasicBlock::const_instr_iterator II = MI; ++II; + assert(II->isInsideBundle() && "Empty bundle?"); + MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); + + // FIXME: This doesn't properly handle multiple uses. + int Idx = -1; + while (II != E && II->isInsideBundle()) { + Idx = II->findRegisterUseOperandIdx(Reg, false, TRI); + if (Idx != -1) + break; + if (II->getOpcode() != ARM::t2IT) + ++Dist; + ++II; + } + + if (Idx == -1) { + Dist = 0; + return 0; + } + + UseIdx = Idx; + return II; +} + int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, const MachineInstr *DefMI, unsigned DefIdx, @@ -2340,35 +2420,77 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, DefMI->isRegSequence() || DefMI->isImplicitDef()) return 1; - const MCInstrDesc &DefMCID = DefMI->getDesc(); if (!ItinData || ItinData->isEmpty()) - return DefMCID.mayLoad() ? 3 : 1; + return DefMI->mayLoad() ? 3 : 1; - const MCInstrDesc &UseMCID = UseMI->getDesc(); + const MCInstrDesc *DefMCID = &DefMI->getDesc(); + const MCInstrDesc *UseMCID = &UseMI->getDesc(); const MachineOperand &DefMO = DefMI->getOperand(DefIdx); - if (DefMO.getReg() == ARM::CPSR) { + unsigned Reg = DefMO.getReg(); + if (Reg == ARM::CPSR) { if (DefMI->getOpcode() == ARM::FMSTAT) { // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?) return Subtarget.isCortexA9() ? 1 : 20; } // CPSR set and branch can be paired in the same cycle. - if (UseMCID.isBranch()) + if (UseMI->isBranch()) return 0; + + // Otherwise it takes the instruction latency (generally one). + int Latency = getInstrLatency(ItinData, DefMI); + + // For Thumb2 and -Os, prefer scheduling CPSR setting instruction close to + // its uses. Instructions which are otherwise scheduled between them may + // incur a code size penalty (not able to use the CPSR setting 16-bit + // instructions). + if (Latency > 0 && Subtarget.isThumb2()) { + const MachineFunction *MF = DefMI->getParent()->getParent(); + if (MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize)) + --Latency; + } + return Latency; } unsigned DefAlign = DefMI->hasOneMemOperand() ? 
(*DefMI->memoperands_begin())->getAlignment() : 0; unsigned UseAlign = UseMI->hasOneMemOperand() ? (*UseMI->memoperands_begin())->getAlignment() : 0; - int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, - UseMCID, UseIdx, UseAlign); + + unsigned DefAdj = 0; + if (DefMI->isBundle()) { + DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj); + if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || + DefMI->isRegSequence() || DefMI->isImplicitDef()) + return 1; + DefMCID = &DefMI->getDesc(); + } + unsigned UseAdj = 0; + if (UseMI->isBundle()) { + unsigned NewUseIdx; + const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI, + Reg, NewUseIdx, UseAdj); + if (NewUseMI) { + UseMI = NewUseMI; + UseIdx = NewUseIdx; + UseMCID = &UseMI->getDesc(); + } + } + + int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign, + *UseMCID, UseIdx, UseAlign); + int Adj = DefAdj + UseAdj; + if (Adj) { + Latency -= (int)(DefAdj + UseAdj); + if (Latency < 1) + return 1; + } if (Latency > 1 && (Subtarget.isCortexA8() || Subtarget.isCortexA9())) { // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] // variants are one cycle cheaper. - switch (DefMCID.getOpcode()) { + switch (DefMCID->getOpcode()) { default: break; case ARM::LDRrs: case ARM::LDRBrs: { @@ -2393,7 +2515,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } if (DefAlign < 8 && Subtarget.isCortexA9()) - switch (DefMCID.getOpcode()) { + switch (DefMCID->getOpcode()) { default: break; case ARM::VLD1q8: case ARM::VLD1q16: @@ -2413,12 +2535,18 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case ARM::VLD2q8: case ARM::VLD2q16: case ARM::VLD2q32: - case ARM::VLD2d8_UPD: - case ARM::VLD2d16_UPD: - case ARM::VLD2d32_UPD: - case ARM::VLD2q8_UPD: - case ARM::VLD2q16_UPD: - case ARM::VLD2q32_UPD: + case ARM::VLD2d8wb_fixed: + case ARM::VLD2d16wb_fixed: + case ARM::VLD2d32wb_fixed: + case ARM::VLD2q8wb_fixed: + case ARM::VLD2q16wb_fixed: + case ARM::VLD2q32wb_fixed: + case ARM::VLD2d8wb_register: + case ARM::VLD2d16wb_register: + case ARM::VLD2d32wb_register: + case ARM::VLD2q8wb_register: + case ARM::VLD2q16wb_register: + case ARM::VLD2q32wb_register: case ARM::VLD3d8: case ARM::VLD3d16: case ARM::VLD3d32: @@ -2446,9 +2574,12 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case ARM::VLD1DUPq8: case ARM::VLD1DUPq16: case ARM::VLD1DUPq32: - case ARM::VLD1DUPq8_UPD: - case ARM::VLD1DUPq16_UPD: - case ARM::VLD1DUPq32_UPD: + case ARM::VLD1DUPq8wb_fixed: + case ARM::VLD1DUPq16wb_fixed: + case ARM::VLD1DUPq32wb_fixed: + case ARM::VLD1DUPq8wb_register: + case ARM::VLD1DUPq16wb_register: + case ARM::VLD1DUPq32wb_register: case ARM::VLD2DUPd8: case ARM::VLD2DUPd16: case ARM::VLD2DUPd32: @@ -2580,12 +2711,18 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case ARM::VLD2q8Pseudo: case ARM::VLD2q16Pseudo: case ARM::VLD2q32Pseudo: - case ARM::VLD2d8Pseudo_UPD: - case ARM::VLD2d16Pseudo_UPD: - case ARM::VLD2d32Pseudo_UPD: - case ARM::VLD2q8Pseudo_UPD: - case ARM::VLD2q16Pseudo_UPD: - case ARM::VLD2q32Pseudo_UPD: + case ARM::VLD2d8PseudoWB_fixed: + case ARM::VLD2d16PseudoWB_fixed: + case ARM::VLD2d32PseudoWB_fixed: + case ARM::VLD2q8PseudoWB_fixed: + case ARM::VLD2q16PseudoWB_fixed: + case ARM::VLD2q32PseudoWB_fixed: + case ARM::VLD2d8PseudoWB_register: + case ARM::VLD2d16PseudoWB_register: + case ARM::VLD2d32PseudoWB_register: + case ARM::VLD2q8PseudoWB_register: + case 
ARM::VLD2q16PseudoWB_register: + case ARM::VLD2q32PseudoWB_register: case ARM::VLD3d8Pseudo: case ARM::VLD3d16Pseudo: case ARM::VLD3d32Pseudo: @@ -2621,9 +2758,12 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case ARM::VLD1DUPq8Pseudo: case ARM::VLD1DUPq16Pseudo: case ARM::VLD1DUPq32Pseudo: - case ARM::VLD1DUPq8Pseudo_UPD: - case ARM::VLD1DUPq16Pseudo_UPD: - case ARM::VLD1DUPq32Pseudo_UPD: + case ARM::VLD1DUPq8PseudoWB_fixed: + case ARM::VLD1DUPq16PseudoWB_fixed: + case ARM::VLD1DUPq32PseudoWB_fixed: + case ARM::VLD1DUPq8PseudoWB_register: + case ARM::VLD1DUPq16PseudoWB_register: + case ARM::VLD1DUPq32PseudoWB_register: case ARM::VLD2DUPd8Pseudo: case ARM::VLD2DUPd16Pseudo: case ARM::VLD2DUPd32Pseudo: @@ -2671,6 +2811,19 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return Latency; } +unsigned +ARMBaseInstrInfo::getOutputLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *DepMI) const { + unsigned Reg = DefMI->getOperand(DefIdx).getReg(); + if (DepMI->readsRegister(Reg, &getRegisterInfo()) || !isPredicated(DepMI)) + return 1; + + // If the second MI is predicated, then there is an implicit use dependency. + return getOperandLatency(ItinData, DefMI, DefIdx, DepMI, + DepMI->getNumOperands()); +} + int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr *MI, unsigned *PredCost) const { @@ -2681,6 +2834,17 @@ int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, if (!ItinData || ItinData->isEmpty()) return 1; + if (MI->isBundle()) { + int Latency = 0; + MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); + while (++I != E && I->isInsideBundle()) { + if (I->getOpcode() != ARM::t2IT) + Latency += getInstrLatency(ItinData, I, PredCost); + } + return Latency; + } + const MCInstrDesc &MCID = MI->getDesc(); unsigned Class = MCID.getSchedClass(); unsigned UOps = ItinData->Itineraries[Class].NumMicroOps; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 0f9f321..68e8208 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -69,10 +69,7 @@ public: bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; // Predication support. - bool isPredicated(const MachineInstr *MI) const { - int PIdx = MI->findFirstPredOperandIdx(); - return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL; - } + bool isPredicated(const MachineInstr *MI) const; ARMCC::CondCodes getPredicate(const MachineInstr *MI) const { int PIdx = MI->findFirstPredOperandIdx(); @@ -213,12 +210,18 @@ public: SDNode *DefNode, unsigned DefIdx, SDNode *UseNode, unsigned UseIdx) const; + virtual unsigned getOutputLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *DepMI) const; + /// VFP/NEON execution domains. 
std::pair<uint16_t, uint16_t> getExecutionDomain(const MachineInstr *MI) const; void setExecutionDomain(MachineInstr *MI, unsigned Domain) const; private: + unsigned getInstBundleLength(const MachineInstr *MI) const; + int getVLDMDefCycle(const InstrItineraryData *ItinData, const MCInstrDesc &DefMCID, unsigned DefClass, diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 7c42342..8ee6ce2 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -631,7 +631,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { // 1. Dynamic stack realignment is explicitly disabled, // 2. This is a Thumb1 function (it's not useful, so we don't bother), or // 3. There are VLAs in the function and the base pointer is disabled. - return (RealignStack && !AFI->isThumb1OnlyFunction() && + return (MF.getTarget().Options.RealignStack && !AFI->isThumb1OnlyFunction() && (!MFI->hasVarSizedObjects() || EnableBasePointer)); } @@ -649,7 +649,7 @@ needsStackRealignment(const MachineFunction &MF) const { bool ARMBaseRegisterInfo:: cannotEliminateFrame(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - if (DisableFramePointerElim(MF) && MFI->adjustsStack()) + if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack()) return true; return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || needsStackRealignment(MF); diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index d74ccfa..365f0bb 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -401,7 +401,7 @@ bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB) { MCE.StartMachineBasicBlock(MBB); - for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ++I) emitInstruction(*I); } diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 3e3a413..2039d41 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -26,6 +26,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -51,6 +52,43 @@ static cl::opt<bool> AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true), cl::desc("Adjust basic block layout to better use TB[BH]")); +static cl::opt<bool> +AlignConstantIslands("arm-align-constant-islands", cl::Hidden, cl::init(true), + cl::desc("Align constant islands in code")); + +/// UnknownPadding - Return the worst case padding that could result from +/// unknown offset bits. This does not include alignment padding caused by +/// known offset bits. +/// +/// @param LogAlign log2(alignment) +/// @param KnownBits Number of known low offset bits. +static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) { + if (KnownBits < LogAlign) + return (1u << LogAlign) - (1u << KnownBits); + return 0; +} + +/// WorstCaseAlign - Assuming only the low KnownBits bits in Offset are exact, +/// add padding such that: +/// +/// 1. The result is aligned to 1 << LogAlign. +/// +/// 2. No other value of the unknown bits would require more padding. 
+/// +/// This may add more padding than is required to satisfy just one of the +/// constraints. It is necessary to compute alignment this way to guarantee +/// that we don't underestimate the padding before an aligned block. If the +/// real padding before a block is larger than we think, constant pool entries +/// may go out of range. +static inline unsigned WorstCaseAlign(unsigned Offset, unsigned LogAlign, + unsigned KnownBits) { + // Add the worst possible padding that the unknown bits could cause. + Offset += UnknownPadding(LogAlign, KnownBits); + + // Then align the result. + return RoundUpToAlignment(Offset, 1u << LogAlign); +} + namespace { /// ARMConstantIslands - Due to limited PC-relative displacements, ARM /// requires constant pool entries to be scattered among the instructions @@ -64,16 +102,70 @@ namespace { /// CPE - A constant pool entry that has been placed somewhere, which /// tracks a list of users. class ARMConstantIslands : public MachineFunctionPass { - /// BBSizes - The size of each MachineBasicBlock in bytes of code, indexed - /// by MBB Number. The two-byte pads required for Thumb alignment are - /// counted as part of the following block (i.e., the offset and size for - /// a padded block will both be ==2 mod 4). - std::vector<unsigned> BBSizes; + /// BasicBlockInfo - Information about the offset and size of a single + /// basic block. + struct BasicBlockInfo { + /// Offset - Distance from the beginning of the function to the beginning + /// of this basic block. + /// + /// The offset is always aligned as required by the basic block. + unsigned Offset; + + /// Size - Size of the basic block in bytes. If the block contains + /// inline assembly, this is a worst case estimate. + /// + /// The size does not include any alignment padding whether from the + /// beginning of the block, or from an aligned jump table at the end. + unsigned Size; + + /// KnownBits - The number of low bits in Offset that are known to be + /// exact. The remaining bits of Offset are an upper bound. + uint8_t KnownBits; + + /// Unalign - When non-zero, the block contains instructions (inline asm) + /// of unknown size. The real size may be smaller than Size bytes by a + /// multiple of 1 << Unalign. + uint8_t Unalign; + + /// PostAlign - When non-zero, the block terminator contains a .align + /// directive, so the end of the block is aligned to 1 << PostAlign + /// bytes. + uint8_t PostAlign; + + BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0), + PostAlign(0) {} + + /// Compute the number of known offset bits internally to this block. + /// This number should be used to predict worst case padding when + /// splitting the block. + unsigned internalKnownBits() const { + return Unalign ? Unalign : KnownBits; + } + + /// Compute the offset immediately following this block. If LogAlign is + /// specified, return the offset the successor block will get if it has + /// this alignment. + unsigned postOffset(unsigned LogAlign = 0) const { + unsigned PO = Offset + Size; + unsigned LA = std::max(unsigned(PostAlign), LogAlign); + if (!LA) + return PO; + // Add alignment padding from the terminator. + return WorstCaseAlign(PO, LA, internalKnownBits()); + } + + /// Compute the number of known low bits of postOffset. If this block + /// contains inline asm, the number of known bits drops to the + /// instruction alignment. An aligned terminator may increase the number + /// of know bits. + /// If LogAlign is given, also consider the alignment of the next block. 
+ unsigned postKnownBits(unsigned LogAlign = 0) const { + return std::max(std::max(unsigned(PostAlign), LogAlign), + internalKnownBits()); + } + }; - /// BBOffsets - the offset of each MBB in bytes, starting from 0. - /// The two-byte pads required for Thumb alignment are counted as part of - /// the following block. - std::vector<unsigned> BBOffsets; + std::vector<BasicBlockInfo> BBInfo; /// WaterList - A sorted list of basic blocks where islands could be placed /// (i.e. blocks that don't fall through to the following block, due @@ -162,9 +254,8 @@ namespace { /// the branch fix up pass. bool HasFarJump; - /// HasInlineAsm - True if the function contains inline assembly. - bool HasInlineAsm; - + MachineFunction *MF; + MachineConstantPool *MCP; const ARMInstrInfo *TII; const ARMSubtarget *STI; ARMFunctionInfo *AFI; @@ -182,67 +273,65 @@ namespace { } private: - void DoInitialPlacement(MachineFunction &MF, - std::vector<MachineInstr*> &CPEMIs); + void DoInitialPlacement(std::vector<MachineInstr*> &CPEMIs); CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); - void JumpTableFunctionScan(MachineFunction &MF); - void InitialFunctionScan(MachineFunction &MF, - const std::vector<MachineInstr*> &CPEMIs); + unsigned getCPELogAlign(const MachineInstr *CPEMI); + void JumpTableFunctionScan(); + void InitialFunctionScan(const std::vector<MachineInstr*> &CPEMIs); MachineBasicBlock *SplitBlockBeforeInstr(MachineInstr *MI); void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB); - void AdjustBBOffsetsAfter(MachineBasicBlock *BB, int delta); + void AdjustBBOffsetsAfter(MachineBasicBlock *BB); bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI); int LookForExistingCPEntry(CPUser& U, unsigned UserOffset); bool LookForWater(CPUser&U, unsigned UserOffset, water_iterator &WaterIter); void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset, MachineBasicBlock *&NewMBB); - bool HandleConstantPoolUser(MachineFunction &MF, unsigned CPUserIndex); + bool HandleConstantPoolUser(unsigned CPUserIndex); void RemoveDeadCPEMI(MachineInstr *CPEMI); bool RemoveUnusedCPEntries(); bool CPEIsInRange(MachineInstr *MI, unsigned UserOffset, MachineInstr *CPEMI, unsigned Disp, bool NegOk, bool DoDump = false); bool WaterIsInRange(unsigned UserOffset, MachineBasicBlock *Water, - CPUser &U); - bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset, - unsigned Disp, bool NegativeOK, bool IsSoImm = false); + CPUser &U, unsigned &Growth); bool BBIsInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); - bool FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br); - bool FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br); - bool FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br); + bool FixUpImmediateBr(ImmBranch &Br); + bool FixUpConditionalBr(ImmBranch &Br); + bool FixUpUnconditionalBr(ImmBranch &Br); bool UndoLRSpillRestore(); - bool OptimizeThumb2Instructions(MachineFunction &MF); - bool OptimizeThumb2Branches(MachineFunction &MF); - bool ReorderThumb2JumpTables(MachineFunction &MF); - bool OptimizeThumb2JumpTables(MachineFunction &MF); + bool OptimizeThumb2Instructions(); + bool OptimizeThumb2Branches(); + bool ReorderThumb2JumpTables(); + bool OptimizeThumb2JumpTables(); MachineBasicBlock *AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB); + void ComputeBlockSize(MachineBasicBlock *MBB); unsigned GetOffsetOf(MachineInstr *MI) const; void dumpBBs(); - void verify(MachineFunction &MF); + void verify(); + + bool OffsetIsInRange(unsigned UserOffset, 
unsigned TrialOffset, + unsigned Disp, bool NegativeOK, bool IsSoImm = false); + bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset, + const CPUser &U) { + return OffsetIsInRange(UserOffset, TrialOffset, + U.MaxDisp, U.NegOk, U.IsSoImm); + } }; char ARMConstantIslands::ID = 0; } /// verify - check BBOffsets, BBSizes, alignment of islands -void ARMConstantIslands::verify(MachineFunction &MF) { - assert(BBOffsets.size() == BBSizes.size()); - for (unsigned i = 1, e = BBOffsets.size(); i != e; ++i) - assert(BBOffsets[i-1]+BBSizes[i-1] == BBOffsets[i]); - if (!isThumb) - return; +void ARMConstantIslands::verify() { #ifndef NDEBUG - for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E; ++MBBI) { MachineBasicBlock *MBB = MBBI; - if (!MBB->empty() && - MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) { - unsigned MBBId = MBB->getNumber(); - assert(HasInlineAsm || - (BBOffsets[MBBId]%4 == 0 && BBSizes[MBBId]%4 == 0) || - (BBOffsets[MBBId]%4 != 0 && BBSizes[MBBId]%4 != 0)); - } + unsigned Align = MBB->getAlignment(); + unsigned MBBId = MBB->getNumber(); + assert(BBInfo[MBBId].Offset % (1u << Align) == 0); + assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset); } for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { CPUser &U = CPUsers[i]; @@ -257,10 +346,16 @@ void ARMConstantIslands::verify(MachineFunction &MF) { /// print block size and offset information - debugging void ARMConstantIslands::dumpBBs() { - for (unsigned J = 0, E = BBOffsets.size(); J !=E; ++J) { - DEBUG(errs() << "block " << J << " offset " << BBOffsets[J] - << " size " << BBSizes[J] << "\n"); - } + DEBUG({ + for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) { + const BasicBlockInfo &BBI = BBInfo[J]; + dbgs() << format("%08x BB#%u\t", BBI.Offset, J) + << " kb=" << unsigned(BBI.KnownBits) + << " ua=" << unsigned(BBI.Unalign) + << " pa=" << unsigned(BBI.PostAlign) + << format(" size=%#x\n", BBInfo[J].Size); + } + }); } /// createARMConstantIslandPass - returns an instance of the constpool @@ -269,34 +364,38 @@ FunctionPass *llvm::createARMConstantIslandPass() { return new ARMConstantIslands(); } -bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { - MachineConstantPool &MCP = *MF.getConstantPool(); +bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + MCP = mf.getConstantPool(); - TII = (const ARMInstrInfo*)MF.getTarget().getInstrInfo(); - AFI = MF.getInfo<ARMFunctionInfo>(); - STI = &MF.getTarget().getSubtarget<ARMSubtarget>(); + DEBUG(dbgs() << "***** ARMConstantIslands: " + << MCP->getConstants().size() << " CP entries, aligned to " + << MCP->getConstantPoolAlignment() << " bytes *****\n"); + + TII = (const ARMInstrInfo*)MF->getTarget().getInstrInfo(); + AFI = MF->getInfo<ARMFunctionInfo>(); + STI = &MF->getTarget().getSubtarget<ARMSubtarget>(); isThumb = AFI->isThumbFunction(); isThumb1 = AFI->isThumb1OnlyFunction(); isThumb2 = AFI->isThumb2Function(); HasFarJump = false; - HasInlineAsm = false; // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. - MF.RenumberBlocks(); + MF->RenumberBlocks(); // Try to reorder and otherwise adjust the block layout to make good use // of the TB[BH] instructions. 
bool MadeChange = false; if (isThumb2 && AdjustJumpTableBlocks) { - JumpTableFunctionScan(MF); - MadeChange |= ReorderThumb2JumpTables(MF); + JumpTableFunctionScan(); + MadeChange |= ReorderThumb2JumpTables(); // Data is out of date, so clear it. It'll be re-computed later. T2JumpTables.clear(); // Blocks may have shifted around. Keep the numbering up to date. - MF.RenumberBlocks(); + MF->RenumberBlocks(); } // Thumb1 functions containing constant pools get 4-byte alignment. @@ -304,16 +403,13 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { // ARM and Thumb2 functions need to be 4-byte aligned. if (!isThumb1) - MF.EnsureAlignment(2); // 2 = log2(4) + MF->EnsureAlignment(2); // 2 = log2(4) // Perform the initial placement of the constant pool entries. To start with, // we put them all at the end of the function. std::vector<MachineInstr*> CPEMIs; - if (!MCP.isEmpty()) { - DoInitialPlacement(MF, CPEMIs); - if (isThumb1) - MF.EnsureAlignment(2); // 2 = log2(4) - } + if (!MCP->isEmpty()) + DoInitialPlacement(CPEMIs); /// The next UID to take is the first unused one. AFI->initPICLabelUId(CPEMIs.size()); @@ -321,7 +417,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { // Do the initial scan of the function, building up information about the // sizes of each block, the location of all the water, and finding all of the // constant pool users. - InitialFunctionScan(MF, CPEMIs); + InitialFunctionScan(CPEMIs); CPEMIs.clear(); DEBUG(dumpBBs()); @@ -333,9 +429,10 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { // is no change. unsigned NoCPIters = 0, NoBRIters = 0; while (true) { + DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n'); bool CPChange = false; for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) - CPChange |= HandleConstantPoolUser(MF, i); + CPChange |= HandleConstantPoolUser(i); if (CPChange && ++NoCPIters > 30) llvm_unreachable("Constant Island pass failed to converge!"); DEBUG(dumpBBs()); @@ -344,9 +441,10 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { // appear as "new water" for the next iteration of constant pool placement. NewWaterList.clear(); + DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n'); bool BRChange = false; for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) - BRChange |= FixUpImmediateBr(MF, ImmBranches[i]); + BRChange |= FixUpImmediateBr(ImmBranches[i]); if (BRChange && ++NoBRIters > 30) llvm_unreachable("Branch Fix Up pass failed to converge!"); DEBUG(dumpBBs()); @@ -358,10 +456,10 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { // Shrink 32-bit Thumb2 branch, load, and store instructions. if (isThumb2 && !STI->prefers32BitThumb()) - MadeChange |= OptimizeThumb2Instructions(MF); + MadeChange |= OptimizeThumb2Instructions(); // After a while, this might be made debug-only, but it is not expensive. - verify(MF); + verify(); // If LR has been forced spilled and no far jump (i.e. BL) has been issued, // undo the spill / restore of LR if possible. @@ -376,10 +474,9 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { } } - DEBUG(errs() << '\n'; dumpBBs()); + DEBUG(dbgs() << '\n'; dumpBBs()); - BBSizes.clear(); - BBOffsets.clear(); + BBInfo.clear(); WaterList.clear(); CPUsers.clear(); CPEntries.clear(); @@ -392,37 +489,65 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { /// DoInitialPlacement - Perform the initial placement of the constant pool /// entries. 
To start with, we put them all at the end of the function. -void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF, - std::vector<MachineInstr*> &CPEMIs) { +void +ARMConstantIslands::DoInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { // Create the basic block to hold the CPE's. - MachineBasicBlock *BB = MF.CreateMachineBasicBlock(); - MF.push_back(BB); + MachineBasicBlock *BB = MF->CreateMachineBasicBlock(); + MF->push_back(BB); + + // MachineConstantPool measures alignment in bytes. We measure in log2(bytes). + unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment()); + + // Mark the basic block as required by the const-pool. + // If AlignConstantIslands isn't set, use 4-byte alignment for everything. + BB->setAlignment(AlignConstantIslands ? MaxAlign : 2); + + // The function needs to be as aligned as the basic blocks. The linker may + // move functions around based on their alignment. + MF->EnsureAlignment(BB->getAlignment()); + + // Order the entries in BB by descending alignment. That ensures correct + // alignment of all entries as long as BB is sufficiently aligned. Keep + // track of the insertion point for each alignment. We are going to bucket + // sort the entries as they are created. + SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end()); // Add all of the constants from the constant pool to the end block, use an // identity mapping of CPI's to CPE's. - const std::vector<MachineConstantPoolEntry> &CPs = - MF.getConstantPool()->getConstants(); + const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants(); - const TargetData &TD = *MF.getTarget().getTargetData(); + const TargetData &TD = *MF->getTarget().getTargetData(); for (unsigned i = 0, e = CPs.size(); i != e; ++i) { unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); - // Verify that all constant pool entries are a multiple of 4 bytes. If not, - // we would have to pad them out or something so that instructions stay - // aligned. - assert((Size & 3) == 0 && "CP Entry not multiple of 4 bytes!"); + assert(Size >= 4 && "Too small constant pool entry"); + unsigned Align = CPs[i].getAlignment(); + assert(isPowerOf2_32(Align) && "Invalid alignment"); + // Verify that all constant pool entries are a multiple of their alignment. + // If not, we would have to pad them out so that instructions stay aligned. + assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!"); + + // Insert CONSTPOOL_ENTRY before entries with a smaller alignment. + unsigned LogAlign = Log2_32(Align); + MachineBasicBlock::iterator InsAt = InsPoint[LogAlign]; MachineInstr *CPEMI = - BuildMI(BB, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY)) + BuildMI(*BB, InsAt, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY)) .addImm(i).addConstantPoolIndex(i).addImm(Size); CPEMIs.push_back(CPEMI); + // Ensure that future entries with higher alignment get inserted before + // CPEMI. This is bucket sort with iterators. + for (unsigned a = LogAlign + 1; a < MaxAlign; ++a) + if (InsPoint[a] == InsAt) + InsPoint[a] = CPEMI; + // Add a new CPEntry, but no corresponding CPUser yet. 
std::vector<CPEntry> CPEs; CPEs.push_back(CPEntry(CPEMI, i)); CPEntries.push_back(CPEs); ++NumCPEs; - DEBUG(errs() << "Moved CPI#" << i << " to end of function as #" << i - << "\n"); + DEBUG(dbgs() << "Moved CPI#" << i << " to end of function\n"); } + DEBUG(BB->dump()); } /// BBHasFallthrough - Return true if the specified basic block can fallthrough @@ -458,17 +583,33 @@ ARMConstantIslands::CPEntry return NULL; } +/// getCPELogAlign - Returns the required alignment of the constant pool entry +/// represented by CPEMI. Alignment is measured in log2(bytes) units. +unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) { + assert(CPEMI && CPEMI->getOpcode() == ARM::CONSTPOOL_ENTRY); + + // Everything is 4-byte aligned unless AlignConstantIslands is set. + if (!AlignConstantIslands) + return 2; + + unsigned CPI = CPEMI->getOperand(1).getIndex(); + assert(CPI < MCP->getConstants().size() && "Invalid constant pool index."); + unsigned Align = MCP->getConstants()[CPI].getAlignment(); + assert(isPowerOf2_32(Align) && "Invalid CPE alignment"); + return Log2_32(Align); +} + /// JumpTableFunctionScan - Do a scan of the function, building up /// information about the sizes of each block and the locations of all /// the jump tables. -void ARMConstantIslands::JumpTableFunctionScan(MachineFunction &MF) { - for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); +void ARMConstantIslands::JumpTableFunctionScan() { + for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E; ++MBBI) { MachineBasicBlock &MBB = *MBBI; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) - if (I->getDesc().isBranch() && I->getOpcode() == ARM::t2BR_JT) + if (I->isBranch() && I->getOpcode() == ARM::t2BR_JT) T2JumpTables.push_back(I); } } @@ -476,23 +617,27 @@ void ARMConstantIslands::JumpTableFunctionScan(MachineFunction &MF) { /// InitialFunctionScan - Do the initial scan of the function, building up /// information about the sizes of each block, the location of all the water, /// and finding all of the constant pool users. -void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, - const std::vector<MachineInstr*> &CPEMIs) { - // First thing, see if the function has any inline assembly in it. If so, - // we have to be conservative about alignment assumptions, as we don't - // know for sure the size of any instructions in the inline assembly. - for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); - MBBI != E; ++MBBI) { - MachineBasicBlock &MBB = *MBBI; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) - if (I->getOpcode() == ARM::INLINEASM) - HasInlineAsm = true; - } +void ARMConstantIslands:: +InitialFunctionScan(const std::vector<MachineInstr*> &CPEMIs) { + BBInfo.clear(); + BBInfo.resize(MF->getNumBlockIDs()); + + // First thing, compute the size of all basic blocks, and see if the function + // has any inline assembly in it. If so, we have to be conservative about + // alignment assumptions, as we don't know for sure the size of any + // instructions in the inline assembly. + for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) + ComputeBlockSize(I); + + // The known bits of the entry block offset are determined by the function + // alignment. + BBInfo.front().KnownBits = MF->getAlignment(); + + // Compute block offsets and known bits. + AdjustBBOffsetsAfter(MF->begin()); // Now go back through the instructions and build up our data structures. 
- unsigned Offset = 0; - for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E; ++MBBI) { MachineBasicBlock &MBB = *MBBI; @@ -501,16 +646,13 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, if (!BBHasFallthrough(&MBB)) WaterList.push_back(&MBB); - unsigned MBBSize = 0; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { if (I->isDebugValue()) continue; - // Add instruction size to MBBSize. - MBBSize += TII->GetInstSizeInBytes(I); int Opc = I->getOpcode(); - if (I->getDesc().isBranch()) { + if (I->isBranch()) { bool isCond = false; unsigned Bits = 0; unsigned Scale = 1; @@ -518,18 +660,6 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, switch (Opc) { default: continue; // Ignore other JT branches - case ARM::tBR_JTr: - // A Thumb1 table jump may involve padding; for the offsets to - // be right, functions containing these must be 4-byte aligned. - // tBR_JTr expands to a mov pc followed by .align 2 and then the jump - // table entries. So this code checks whether offset of tBR_JTr + 2 - // is aligned. That is held in Offset+MBBSize, which already has - // 2 added in for the size of the mov pc instruction. - MF.EnsureAlignment(2U); - if ((Offset+MBBSize)%4 != 0 || HasInlineAsm) - // FIXME: Add a pseudo ALIGN instruction instead. - MBBSize += 2; // padding - continue; // Does not get an entry in ImmBranches case ARM::t2BR_JT: T2JumpTables.push_back(I); continue; // Does not get an entry in ImmBranches @@ -647,18 +777,30 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, break; } } + } +} - // In thumb mode, if this block is a constpool island, we may need padding - // so it's aligned on 4 byte boundary. - if (isThumb && - !MBB.empty() && - MBB.begin()->getOpcode() == ARM::CONSTPOOL_ENTRY && - ((Offset%4) != 0 || HasInlineAsm)) - MBBSize += 2; - - BBSizes.push_back(MBBSize); - BBOffsets.push_back(Offset); - Offset += MBBSize; +/// ComputeBlockSize - Compute the size and some alignment information for MBB. +/// This function updates BBInfo directly. +void ARMConstantIslands::ComputeBlockSize(MachineBasicBlock *MBB) { + BasicBlockInfo &BBI = BBInfo[MBB->getNumber()]; + BBI.Size = 0; + BBI.Unalign = 0; + BBI.PostAlign = 0; + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) { + BBI.Size += TII->GetInstSizeInBytes(I); + // For inline asm, GetInstSizeInBytes returns a conservative estimate. + // The actual size may be smaller, but still a multiple of the instr size. + if (I->isInlineAsm()) + BBI.Unalign = isThumb ? 1 : 2; + } + + // tBR_JTr contains a .align 2 directive. + if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) { + BBI.PostAlign = 2; + MBB->getParent()->EnsureAlignment(2); } } @@ -671,14 +813,7 @@ unsigned ARMConstantIslands::GetOffsetOf(MachineInstr *MI) const { // The offset is composed of two things: the sum of the sizes of all MBB's // before this instruction's block, and the offset from the start of the block // it is in. - unsigned Offset = BBOffsets[MBB->getNumber()]; - - // If we're looking for a CONSTPOOL_ENTRY in Thumb, see if this block has - // alignment padding, and compensate if so. - if (isThumb && - MI->getOpcode() == ARM::CONSTPOOL_ENTRY && - (Offset%4 != 0 || HasInlineAsm)) - Offset += 2; + unsigned Offset = BBInfo[MBB->getNumber()].Offset; // Sum instructions before MI in MBB. 
for (MachineBasicBlock::iterator I = MBB->begin(); ; ++I) { @@ -702,12 +837,9 @@ void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) { // Renumber the MBB's to keep them consecutive. NewBB->getParent()->RenumberBlocks(NewBB); - // Insert a size into BBSizes to align it properly with the (newly + // Insert an entry into BBInfo to align it properly with the (newly // renumbered) block numbers. - BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0); - - // Likewise for BBOffsets. - BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0); + BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); // Next, update WaterList. Specifically, we need to add NewMBB as having // available water after it. @@ -723,13 +855,12 @@ void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) { /// account for this change and returns the newly created block. MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { MachineBasicBlock *OrigBB = MI->getParent(); - MachineFunction &MF = *OrigBB->getParent(); // Create a new MBB for the code after the OrigBB. MachineBasicBlock *NewBB = - MF.CreateMachineBasicBlock(OrigBB->getBasicBlock()); + MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); MachineFunction::iterator MBBI = OrigBB; ++MBBI; - MF.insert(MBBI, NewBB); + MF->insert(MBBI, NewBB); // Splice the instructions starting with MI over to NewBB. NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); @@ -747,16 +878,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { ++NumSplit; // Update the CFG. All succs of OrigBB are now succs of NewBB. - while (!OrigBB->succ_empty()) { - MachineBasicBlock *Succ = *OrigBB->succ_begin(); - OrigBB->removeSuccessor(Succ); - NewBB->addSuccessor(Succ); - - // This pass should be run after register allocation, so there should be no - // PHI nodes to update. - assert((Succ->empty() || !Succ->begin()->isPHI()) - && "PHI nodes should be eliminated by now!"); - } + NewBB->transferSuccessors(OrigBB); // OrigBB branches to NewBB. OrigBB->addSuccessor(NewBB); @@ -764,14 +886,11 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { // Update internal data structures to account for the newly inserted MBB. // This is almost the same as UpdateForInsertedWaterBlock, except that // the Water goes after OrigBB, not NewBB. - MF.RenumberBlocks(NewBB); + MF->RenumberBlocks(NewBB); - // Insert a size into BBSizes to align it properly with the (newly + // Insert an entry into BBInfo to align it properly with the (newly // renumbered) block numbers. - BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0); - - // Likewise for BBOffsets. - BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0); + BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); // Next, update WaterList. Specifically, we need to add OrigMBB as having // available water after it (but not if it's already there, which happens @@ -787,54 +906,19 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { WaterList.insert(IP, OrigBB); NewWaterList.insert(OrigBB); - unsigned OrigBBI = OrigBB->getNumber(); - unsigned NewBBI = NewBB->getNumber(); - - int delta = isThumb1 ? 2 : 4; - // Figure out how large the OrigBB is. As the first half of the original // block, it cannot contain a tablejump. The size includes // the new jump we added. 
(It should be possible to do this without // recounting everything, but it's very confusing, and this is rarely // executed.) - unsigned OrigBBSize = 0; - for (MachineBasicBlock::iterator I = OrigBB->begin(), E = OrigBB->end(); - I != E; ++I) - OrigBBSize += TII->GetInstSizeInBytes(I); - BBSizes[OrigBBI] = OrigBBSize; - - // ...and adjust BBOffsets for NewBB accordingly. - BBOffsets[NewBBI] = BBOffsets[OrigBBI] + BBSizes[OrigBBI]; + ComputeBlockSize(OrigBB); // Figure out how large the NewMBB is. As the second half of the original // block, it may contain a tablejump. - unsigned NewBBSize = 0; - for (MachineBasicBlock::iterator I = NewBB->begin(), E = NewBB->end(); - I != E; ++I) - NewBBSize += TII->GetInstSizeInBytes(I); - // Set the size of NewBB in BBSizes. It does not include any padding now. - BBSizes[NewBBI] = NewBBSize; - - MachineInstr* ThumbJTMI = prior(NewBB->end()); - if (ThumbJTMI->getOpcode() == ARM::tBR_JTr) { - // We've added another 2-byte instruction before this tablejump, which - // means we will always need padding if we didn't before, and vice versa. - - // The original offset of the jump instruction was: - unsigned OrigOffset = BBOffsets[OrigBBI] + BBSizes[OrigBBI] - delta; - if (OrigOffset%4 == 0) { - // We had padding before and now we don't. No net change in code size. - delta = 0; - } else { - // We didn't have padding before and now we do. - BBSizes[NewBBI] += 2; - delta = 4; - } - } + ComputeBlockSize(NewBB); // All BBOffsets following these blocks must be modified. - if (delta) - AdjustBBOffsetsAfter(NewBB, delta); + AdjustBBOffsetsAfter(OrigBB); return NewBB; } @@ -882,19 +966,44 @@ bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset, /// WaterIsInRange - Returns true if a CPE placed after the specified /// Water (a basic block) will be in range for the specific MI. - +/// +/// Compute how much the function will grow by inserting a CPE after Water. bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset, - MachineBasicBlock* Water, CPUser &U) { - unsigned MaxDisp = U.MaxDisp; - unsigned CPEOffset = BBOffsets[Water->getNumber()] + - BBSizes[Water->getNumber()]; - - // If the CPE is to be inserted before the instruction, that will raise - // the offset of the instruction. - if (CPEOffset < UserOffset) - UserOffset += U.CPEMI->getOperand(2).getImm(); - - return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, U.NegOk, U.IsSoImm); + MachineBasicBlock* Water, CPUser &U, + unsigned &Growth) { + unsigned CPELogAlign = getCPELogAlign(U.CPEMI); + unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); + unsigned NextBlockOffset, NextBlockAlignment; + MachineFunction::const_iterator NextBlock = Water; + if (++NextBlock == MF->end()) { + NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); + NextBlockAlignment = 0; + } else { + NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset; + NextBlockAlignment = NextBlock->getAlignment(); + } + unsigned Size = U.CPEMI->getOperand(2).getImm(); + unsigned CPEEnd = CPEOffset + Size; + + // The CPE may be able to hide in the alignment padding before the next + // block. It may also cause more padding to be required if it is more aligned + // that the next block. + if (CPEEnd > NextBlockOffset) { + Growth = CPEEnd - NextBlockOffset; + // Compute the padding that would go at the end of the CPE to align the next + // block. + Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment); + + // If the CPE is to be inserted before the instruction, that will raise + // the offset of the instruction. 
Also account for unknown alignment padding + // in blocks between CPE and the user. + if (CPEOffset < UserOffset) + UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign); + } else + // CPE fits in existing padding. + Growth = 0; + + return OffsetIsInRange(UserOffset, CPEOffset, U); } /// CPEIsInRange - Returns true if the distance between specific MI and @@ -903,14 +1012,20 @@ bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset, MachineInstr *CPEMI, unsigned MaxDisp, bool NegOk, bool DoDump) { unsigned CPEOffset = GetOffsetOf(CPEMI); - assert((CPEOffset%4 == 0 || HasInlineAsm) && "Misaligned CPE"); + assert(CPEOffset % 4 == 0 && "Misaligned CPE"); if (DoDump) { - DEBUG(errs() << "User of CPE#" << CPEMI->getOperand(0).getImm() - << " max delta=" << MaxDisp - << " insn address=" << UserOffset - << " CPE address=" << CPEOffset - << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI); + DEBUG({ + unsigned Block = MI->getParent()->getNumber(); + const BasicBlockInfo &BBI = BBInfo[Block]; + dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm() + << " max delta=" << MaxDisp + << format(" insn address=%#x", UserOffset) + << " in BB#" << Block << ": " + << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI + << format("CPE address=%#x offset=%+d: ", CPEOffset, + int(CPEOffset-UserOffset)); + }); } return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, NegOk); @@ -933,55 +1048,17 @@ static bool BBIsJumpedOver(MachineBasicBlock *MBB) { } #endif // NDEBUG -void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB, - int delta) { - MachineFunction::iterator MBBI = BB; MBBI = llvm::next(MBBI); - for(unsigned i = BB->getNumber()+1, e = BB->getParent()->getNumBlockIDs(); - i < e; ++i) { - BBOffsets[i] += delta; - // If some existing blocks have padding, adjust the padding as needed, a - // bit tricky. delta can be negative so don't use % on that. - if (!isThumb) - continue; - MachineBasicBlock *MBB = MBBI; - if (!MBB->empty() && !HasInlineAsm) { - // Constant pool entries require padding. - if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) { - unsigned OldOffset = BBOffsets[i] - delta; - if ((OldOffset%4) == 0 && (BBOffsets[i]%4) != 0) { - // add new padding - BBSizes[i] += 2; - delta += 2; - } else if ((OldOffset%4) != 0 && (BBOffsets[i]%4) == 0) { - // remove existing padding - BBSizes[i] -= 2; - delta -= 2; - } - } - // Thumb1 jump tables require padding. They should be at the end; - // following unconditional branches are removed by AnalyzeBranch. - // tBR_JTr expands to a mov pc followed by .align 2 and then the jump - // table entries. So this code checks whether offset of tBR_JTr - // is aligned; if it is, the offset of the jump table following the - // instruction will not be aligned, and we need padding. - MachineInstr *ThumbJTMI = prior(MBB->end()); - if (ThumbJTMI->getOpcode() == ARM::tBR_JTr) { - unsigned NewMIOffset = GetOffsetOf(ThumbJTMI); - unsigned OldMIOffset = NewMIOffset - delta; - if ((OldMIOffset%4) == 0 && (NewMIOffset%4) != 0) { - // remove existing padding - BBSizes[i] -= 2; - delta -= 2; - } else if ((OldMIOffset%4) != 0 && (NewMIOffset%4) == 0) { - // add new padding - BBSizes[i] += 2; - delta += 2; - } - } - if (delta==0) - return; - } - MBBI = llvm::next(MBBI); +void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB) { + for(unsigned i = BB->getNumber() + 1, e = MF->getNumBlockIDs(); i < e; ++i) { + // Get the offset and known bits at the end of the layout predecessor. 
+ // Include the alignment of the current block. + unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment(); + unsigned Offset = BBInfo[i - 1].postOffset(LogAlign); + unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign); + + // This is where block i begins. + BBInfo[i].Offset = Offset; + BBInfo[i].KnownBits = KnownBits; } } @@ -1016,7 +1093,7 @@ int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset) // Check to see if the CPE is already in-range. if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, U.NegOk, true)) { - DEBUG(errs() << "In range\n"); + DEBUG(dbgs() << "In range\n"); return 1; } @@ -1031,7 +1108,7 @@ int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset) if (CPEs[i].CPEMI == NULL) continue; if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, U.NegOk)) { - DEBUG(errs() << "Replacing CPE#" << CPI << " with CPE#" + DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" << CPEs[i].CPI << "\n"); // Point the CPUser node to the replacement U.CPEMI = CPEs[i].CPEMI; @@ -1079,10 +1156,9 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset, if (WaterList.empty()) return false; - bool FoundWaterThatWouldPad = false; - water_iterator IPThatWouldPad; - for (water_iterator IP = prior(WaterList.end()), - B = WaterList.begin();; --IP) { + unsigned BestGrowth = ~0u; + for (water_iterator IP = prior(WaterList.end()), B = WaterList.begin();; + --IP) { MachineBasicBlock* WaterBB = *IP; // Check if water is in range and is either at a lower address than the // current "high water mark" or a new water block that was created since @@ -1092,31 +1168,24 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset, // should be relatively uncommon and when it does happen, we want to be // sure to take advantage of it for all the CPEs near that block, so that // we don't insert more branches than necessary. - if (WaterIsInRange(UserOffset, WaterBB, U) && + unsigned Growth; + if (WaterIsInRange(UserOffset, WaterBB, U, Growth) && (WaterBB->getNumber() < U.HighWaterMark->getNumber() || - NewWaterList.count(WaterBB))) { - unsigned WBBId = WaterBB->getNumber(); - if (isThumb && - (BBOffsets[WBBId] + BBSizes[WBBId])%4 != 0) { - // This is valid Water, but would introduce padding. Remember - // it in case we don't find any Water that doesn't do this. - if (!FoundWaterThatWouldPad) { - FoundWaterThatWouldPad = true; - IPThatWouldPad = IP; - } - } else { - WaterIter = IP; + NewWaterList.count(WaterBB)) && Growth < BestGrowth) { + // This is the least amount of required padding seen so far. + BestGrowth = Growth; + WaterIter = IP; + DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber() + << " Growth=" << Growth << '\n'); + + // Keep looking unless it is perfect. 
+ if (BestGrowth == 0) return true; - } } if (IP == B) break; } - if (FoundWaterThatWouldPad) { - WaterIter = IPThatWouldPad; - return true; - } - return false; + return BestGrowth != ~0u; } /// CreateNewWater - No existing WaterList entry will work for @@ -1132,114 +1201,143 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, CPUser &U = CPUsers[CPUserIndex]; MachineInstr *UserMI = U.MI; MachineInstr *CPEMI = U.CPEMI; + unsigned CPELogAlign = getCPELogAlign(CPEMI); MachineBasicBlock *UserMBB = UserMI->getParent(); - unsigned OffsetOfNextBlock = BBOffsets[UserMBB->getNumber()] + - BBSizes[UserMBB->getNumber()]; - assert(OffsetOfNextBlock== BBOffsets[UserMBB->getNumber()+1]); + const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()]; // If the block does not end in an unconditional branch already, and if the // end of the block is within range, make new water there. (The addition // below is for the unconditional branch we will be adding: 4 bytes on ARM + // Thumb2, 2 on Thumb1. Possible Thumb1 alignment padding is allowed for // inside OffsetIsInRange. - if (BBHasFallthrough(UserMBB) && - OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb1 ? 2: 4), - U.MaxDisp, U.NegOk, U.IsSoImm)) { - DEBUG(errs() << "Split at end of block\n"); - if (&UserMBB->back() == UserMI) - assert(BBHasFallthrough(UserMBB) && "Expected a fallthrough BB!"); - NewMBB = llvm::next(MachineFunction::iterator(UserMBB)); - // Add an unconditional branch from UserMBB to fallthrough block. - // Record it for branch lengthening; this new branch will not get out of - // range, but if the preceding conditional branch is out of range, the - // targets will be exchanged, and the altered branch may be out of - // range, so the machinery has to know about it. - int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B; - if (!isThumb) - BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB); - else - BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB) - .addImm(ARMCC::AL).addReg(0); - unsigned MaxDisp = getUnconditionalBrDisp(UncondBr); - ImmBranches.push_back(ImmBranch(&UserMBB->back(), - MaxDisp, false, UncondBr)); - int delta = isThumb1 ? 2 : 4; - BBSizes[UserMBB->getNumber()] += delta; - AdjustBBOffsetsAfter(UserMBB, delta); - } else { - // What a big block. Find a place within the block to split it. - // This is a little tricky on Thumb1 since instructions are 2 bytes - // and constant pool entries are 4 bytes: if instruction I references - // island CPE, and instruction I+1 references CPE', it will - // not work well to put CPE as far forward as possible, since then - // CPE' cannot immediately follow it (that location is 2 bytes - // farther away from I+1 than CPE was from I) and we'd need to create - // a new island. So, we make a first guess, then walk through the - // instructions between the one currently being looked at and the - // possible insertion point, and make sure any other instructions - // that reference CPEs will be able to use the same island area; - // if not, we back up the insertion point. - - // The 4 in the following is for the unconditional branch we'll be - // inserting (allows for long branch on Thumb1). Alignment of the - // island is handled inside OffsetIsInRange. - unsigned BaseInsertOffset = UserOffset + U.MaxDisp -4; - // This could point off the end of the block if we've already got - // constant pool entries following this block; only the last one is - // in the water list. 
Back past any possible branches (allow for a - // conditional and a maximally long unconditional). - if (BaseInsertOffset >= BBOffsets[UserMBB->getNumber()+1]) - BaseInsertOffset = BBOffsets[UserMBB->getNumber()+1] - - (isThumb1 ? 6 : 8); - unsigned EndInsertOffset = BaseInsertOffset + - CPEMI->getOperand(2).getImm(); - MachineBasicBlock::iterator MI = UserMI; - ++MI; - unsigned CPUIndex = CPUserIndex+1; - unsigned NumCPUsers = CPUsers.size(); - MachineInstr *LastIT = 0; - for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI); - Offset < BaseInsertOffset; - Offset += TII->GetInstSizeInBytes(MI), - MI = llvm::next(MI)) { - if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) { - CPUser &U = CPUsers[CPUIndex]; - if (!OffsetIsInRange(Offset, EndInsertOffset, - U.MaxDisp, U.NegOk, U.IsSoImm)) { - BaseInsertOffset -= (isThumb1 ? 2 : 4); - EndInsertOffset -= (isThumb1 ? 2 : 4); - } - // This is overly conservative, as we don't account for CPEMIs - // being reused within the block, but it doesn't matter much. - EndInsertOffset += CPUsers[CPUIndex].CPEMI->getOperand(2).getImm(); - CPUIndex++; - } + if (BBHasFallthrough(UserMBB)) { + // Size of branch to insert. + unsigned Delta = isThumb1 ? 2 : 4; + // End of UserBlock after adding a branch. + unsigned UserBlockEnd = UserBBI.postOffset() + Delta; + // Compute the offset where the CPE will begin. + unsigned CPEOffset = WorstCaseAlign(UserBlockEnd, CPELogAlign, + UserBBI.postKnownBits()); + + if (OffsetIsInRange(UserOffset, CPEOffset, U)) { + DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber() + << format(", expected CPE offset %#x\n", CPEOffset)); + NewMBB = llvm::next(MachineFunction::iterator(UserMBB)); + // Add an unconditional branch from UserMBB to fallthrough block. Record + // it for branch lengthening; this new branch will not get out of range, + // but if the preceding conditional branch is out of range, the targets + // will be exchanged, and the altered branch may be out of range, so the + // machinery has to know about it. + int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B; + if (!isThumb) + BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB); + else + BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB) + .addImm(ARMCC::AL).addReg(0); + unsigned MaxDisp = getUnconditionalBrDisp(UncondBr); + ImmBranches.push_back(ImmBranch(&UserMBB->back(), + MaxDisp, false, UncondBr)); + BBInfo[UserMBB->getNumber()].Size += Delta; + AdjustBBOffsetsAfter(UserMBB); + return; + } + } - // Remember the last IT instruction. - if (MI->getOpcode() == ARM::t2IT) - LastIT = MI; + // What a big block. Find a place within the block to split it. This is a + // little tricky on Thumb1 since instructions are 2 bytes and constant pool + // entries are 4 bytes: if instruction I references island CPE, and + // instruction I+1 references CPE', it will not work well to put CPE as far + // forward as possible, since then CPE' cannot immediately follow it (that + // location is 2 bytes farther away from I+1 than CPE was from I) and we'd + // need to create a new island. So, we make a first guess, then walk through + // the instructions between the one currently being looked at and the + // possible insertion point, and make sure any other instructions that + // reference CPEs will be able to use the same island area; if not, we back + // up the insertion point. + + // Try to split the block so it's fully aligned. 
Compute the latest split + point where we can add a 4-byte branch instruction, and then + // WorstCaseAlign to LogAlign. + unsigned LogAlign = MF->getAlignment(); + assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry"); + unsigned KnownBits = UserBBI.internalKnownBits(); + unsigned UPad = UnknownPadding(LogAlign, KnownBits); + unsigned BaseInsertOffset = UserOffset + U.MaxDisp; + DEBUG(dbgs() << format("Split in middle of big block before %#x", + BaseInsertOffset)); + + // Account for alignment and unknown padding. + BaseInsertOffset &= ~((1u << LogAlign) - 1); + BaseInsertOffset -= UPad; + + // The 4 in the following is for the unconditional branch we'll be inserting + // (allows for long branch on Thumb1). Alignment of the island is handled + // inside OffsetIsInRange. + BaseInsertOffset -= 4; + + DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset) + << " la=" << LogAlign + << " kb=" << KnownBits + << " up=" << UPad << '\n'); + + // This could point off the end of the block if we've already got constant + // pool entries following this block; only the last one is in the water list. + // Back past any possible branches (allow for a conditional and a maximally + // long unconditional). + if (BaseInsertOffset >= BBInfo[UserMBB->getNumber()+1].Offset) + BaseInsertOffset = BBInfo[UserMBB->getNumber()+1].Offset - + (isThumb1 ? 6 : 8); + unsigned EndInsertOffset = + WorstCaseAlign(BaseInsertOffset + 4, LogAlign, KnownBits) + + CPEMI->getOperand(2).getImm(); + MachineBasicBlock::iterator MI = UserMI; + ++MI; + unsigned CPUIndex = CPUserIndex+1; + unsigned NumCPUsers = CPUsers.size(); + MachineInstr *LastIT = 0; + for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI); + Offset < BaseInsertOffset; + Offset += TII->GetInstSizeInBytes(MI), + MI = llvm::next(MI)) { + if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) { + CPUser &U = CPUsers[CPUIndex]; + if (!OffsetIsInRange(Offset, EndInsertOffset, U)) { + // Shift insertion point by one unit of alignment so it is within reach. + BaseInsertOffset -= 1u << LogAlign; + EndInsertOffset -= 1u << LogAlign; + } + // This is overly conservative, as we don't account for CPEMIs being + // reused within the block, but it doesn't matter much. Also assume CPEs + // are added in order with alignment padding. We may eventually be able + // to pack the aligned CPEs better. + EndInsertOffset = RoundUpToAlignment(EndInsertOffset, + 1u << getCPELogAlign(U.CPEMI)) + + U.CPEMI->getOperand(2).getImm(); + CPUIndex++; } - DEBUG(errs() << "Split in middle of big block\n"); - --MI; + // Remember the last IT instruction. + if (MI->getOpcode() == ARM::t2IT) + LastIT = MI; + } - // Avoid splitting an IT block. - if (LastIT) { - unsigned PredReg = 0; - ARMCC::CondCodes CC = llvm::getITInstrPredicate(MI, PredReg); - if (CC != ARMCC::AL) - MI = LastIT; - } - NewMBB = SplitBlockBeforeInstr(MI); + --MI; + + // Avoid splitting an IT block. + if (LastIT) { + unsigned PredReg = 0; + ARMCC::CondCodes CC = llvm::getITInstrPredicate(MI, PredReg); + if (CC != ARMCC::AL) + MI = LastIT; + } + NewMBB = SplitBlockBeforeInstr(MI); } /// HandleConstantPoolUser - Analyze the specified user, checking to see if it /// is out-of-range. If so, pick up the constant pool value and move it some /// place in-range. Return true if we changed any addresses (thus must run /// another pass of branch lengthening), false otherwise.
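As an aside, the rewritten WaterIsInRange / CreateNewWater logic above leans on a handful of alignment helpers (UnknownPadding, WorstCaseAlign, OffsetToAlignment, RoundUpToAlignment). Below is a minimal standalone C++ sketch of that padding arithmetic, assuming offsets are byte counts and LogAlign is log2 of the alignment in bytes; the lowercase helper names are local to this sketch and are an illustration of the idea, not the exact implementations in ARMConstantIslands.cpp or Support/MathExtras.h.

#include <cassert>

// Bytes needed to advance Offset to the next 2^LogAlign boundary.
static unsigned offsetToAlignment(unsigned Offset, unsigned LogAlign) {
  unsigned Align = 1u << LogAlign;
  return (Align - (Offset & (Align - 1))) & (Align - 1);
}

// If only the low KnownBits bits of an offset are exact, aligning to a
// 2^LogAlign boundary may insert up to (2^LogAlign - 2^KnownBits) extra bytes.
static unsigned unknownPadding(unsigned LogAlign, unsigned KnownBits) {
  return KnownBits < LogAlign ? (1u << LogAlign) - (1u << KnownBits) : 0;
}

// Pessimistic offset after aligning to 2^LogAlign: add the padding the
// unknown high bits could force, then round up to the boundary.
static unsigned worstCaseAlign(unsigned Offset, unsigned LogAlign,
                               unsigned KnownBits) {
  Offset += unknownPadding(LogAlign, KnownBits);
  return Offset + offsetToAlignment(Offset, LogAlign);
}

int main() {
  // Example: a nominal offset of 0x61 whose low bit alone is known, aligned
  // to 4 bytes (LogAlign = 2), may slip by up to 2 bytes of unknown padding
  // and then rounds up to 0x64 in the worst case.
  assert(unknownPadding(2, 1) == 2);
  assert(worstCaseAlign(0x61, 2, 1) == 0x64);
  return 0;
}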
-bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, - unsigned CPUserIndex) { +bool ARMConstantIslands::HandleConstantPoolUser(unsigned CPUserIndex) { CPUser &U = CPUsers[CPUserIndex]; MachineInstr *UserMI = U.MI; MachineInstr *CPEMI = U.CPEMI; @@ -1260,11 +1358,11 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, unsigned ID = AFI->createPICLabelUId(); // Look for water where we can place this CPE. - MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock(); + MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock(); MachineBasicBlock *NewMBB; water_iterator IP; if (LookForWater(U, UserOffset, IP)) { - DEBUG(errs() << "found water in range\n"); + DEBUG(dbgs() << "Found water in range\n"); MachineBasicBlock *WaterBB = *IP; // If the original WaterList entry was "new water" on this iteration, @@ -1279,7 +1377,7 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, } else { // No water found. - DEBUG(errs() << "No water found\n"); + DEBUG(dbgs() << "No water found\n"); CreateNewWater(CPUserIndex, UserOffset, NewMBB); // SplitBlockBeforeInstr adds to WaterList, which is important when it is @@ -1304,7 +1402,7 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, WaterList.erase(IP); // Okay, we know we can put an island before NewMBB now, do it! - MF.insert(NewMBB, NewIsland); + MF->insert(NewMBB, NewIsland); // Update internal data structures to account for the newly inserted MBB. UpdateForInsertedWaterBlock(NewIsland); @@ -1320,13 +1418,12 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1)); ++NumCPEs; - BBOffsets[NewIsland->getNumber()] = BBOffsets[NewMBB->getNumber()]; - // Compensate for .align 2 in thumb mode. - if (isThumb && (BBOffsets[NewIsland->getNumber()]%4 != 0 || HasInlineAsm)) - Size += 2; + // Mark the basic block as aligned as required by the const-pool entry. + NewIsland->setAlignment(getCPELogAlign(U.CPEMI)); + // Increase the size of the island block to account for the new entry. - BBSizes[NewIsland->getNumber()] += Size; - AdjustBBOffsetsAfter(NewIsland, Size); + BBInfo[NewIsland->getNumber()].Size += Size; + AdjustBBOffsetsAfter(llvm::prior(MachineFunction::iterator(NewIsland))); // Finally, change the CPI in the instruction operand to be ID. for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) @@ -1335,8 +1432,8 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, break; } - DEBUG(errs() << " Moved CPE to #" << ID << " CPI=" << CPI - << '\t' << *UserMI); + DEBUG(dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI + << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset)); return true; } @@ -1347,19 +1444,18 @@ void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) { MachineBasicBlock *CPEBB = CPEMI->getParent(); unsigned Size = CPEMI->getOperand(2).getImm(); CPEMI->eraseFromParent(); - BBSizes[CPEBB->getNumber()] -= Size; + BBInfo[CPEBB->getNumber()].Size -= Size; // All succeeding offsets have the current size value added in, fix this. if (CPEBB->empty()) { - // In thumb1 mode, the size of island may be padded by two to compensate for - // the alignment requirement. Then it will now be 2 when the block is - // empty, so fix this. - // All succeeding offsets have the current size value added in, fix this. 
- if (BBSizes[CPEBB->getNumber()] != 0) { - Size += BBSizes[CPEBB->getNumber()]; - BBSizes[CPEBB->getNumber()] = 0; - } - } - AdjustBBOffsetsAfter(CPEBB, -Size); + BBInfo[CPEBB->getNumber()].Size = 0; + + // This block no longer needs to be aligned. <rdar://problem/10534709>. + CPEBB->setAlignment(0); + } else + // Entries are sorted by descending alignment, so realign from the front. + CPEBB->setAlignment(getCPELogAlign(CPEBB->begin())); + + AdjustBBOffsetsAfter(CPEBB); // An island has only one predecessor BB and one successor BB. Check if // this BB's predecessor jumps directly to this BB's successor. This // shouldn't happen currently. @@ -1390,9 +1486,9 @@ bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB, unsigned MaxDisp) { unsigned PCAdj = isThumb ? 4 : 8; unsigned BrOffset = GetOffsetOf(MI) + PCAdj; - unsigned DestOffset = BBOffsets[DestBB->getNumber()]; + unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset; - DEBUG(errs() << "Branch of destination BB#" << DestBB->getNumber() + DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() << " from BB#" << MI->getParent()->getNumber() << " max delta=" << MaxDisp << " from " << GetOffsetOf(MI) << " to " << DestOffset @@ -1411,7 +1507,7 @@ bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB, /// FixUpImmediateBr - Fix up an immediate branch whose destination is too far /// away to fit in its displacement field. -bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br) { +bool ARMConstantIslands::FixUpImmediateBr(ImmBranch &Br) { MachineInstr *MI = Br.MI; MachineBasicBlock *DestBB = MI->getOperand(0).getMBB(); @@ -1420,8 +1516,8 @@ bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br) { return false; if (!Br.isCond) - return FixUpUnconditionalBr(MF, Br); - return FixUpConditionalBr(MF, Br); + return FixUpUnconditionalBr(Br); + return FixUpConditionalBr(Br); } /// FixUpUnconditionalBr - Fix up an unconditional branch whose destination is @@ -1429,7 +1525,7 @@ bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br) { /// spilled in the epilogue, then we can use BL to implement a far jump. /// Otherwise, add an intermediate branch instruction to a branch. bool -ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) { +ARMConstantIslands::FixUpUnconditionalBr(ImmBranch &Br) { MachineInstr *MI = Br.MI; MachineBasicBlock *MBB = MI->getParent(); if (!isThumb1) @@ -1438,12 +1534,12 @@ ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) { // Use BL to implement far jump. Br.MaxDisp = (1 << 21) * 2; MI->setDesc(TII->get(ARM::tBfar)); - BBSizes[MBB->getNumber()] += 2; - AdjustBBOffsetsAfter(MBB, 2); + BBInfo[MBB->getNumber()].Size += 2; + AdjustBBOffsetsAfter(MBB); HasFarJump = true; ++NumUBrFixed; - DEBUG(errs() << " Changed B to long jump " << *MI); + DEBUG(dbgs() << " Changed B to long jump " << *MI); return true; } @@ -1452,7 +1548,7 @@ ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) { /// far away to fit in its displacement field. It is converted to an inverse /// conditional branch + an unconditional branch to the destination. 
bool -ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) { +ARMConstantIslands::FixUpConditionalBr(ImmBranch &Br) { MachineInstr *MI = Br.MI; MachineBasicBlock *DestBB = MI->getOperand(0).getMBB(); @@ -1487,7 +1583,7 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) { // b L1 MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); if (BBIsInRange(MI, NewDest, Br.MaxDisp)) { - DEBUG(errs() << " Invert Bcc condition and swap its destination with " + DEBUG(dbgs() << " Invert Bcc condition and swap its destination with " << *BMI); BMI->getOperand(0).setMBB(DestBB); MI->getOperand(0).setMBB(NewDest); @@ -1502,15 +1598,13 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) { // No need for the branch to the next block. We're adding an unconditional // branch to the destination. int delta = TII->GetInstSizeInBytes(&MBB->back()); - BBSizes[MBB->getNumber()] -= delta; - MachineBasicBlock* SplitBB = llvm::next(MachineFunction::iterator(MBB)); - AdjustBBOffsetsAfter(SplitBB, -delta); + BBInfo[MBB->getNumber()].Size -= delta; MBB->back().eraseFromParent(); - // BBOffsets[SplitBB] is wrong temporarily, fixed below + // BBInfo[SplitBB].Offset is wrong temporarily, fixed below } MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB)); - DEBUG(errs() << " Insert B to BB#" << DestBB->getNumber() + DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() << " also invert condition and change dest. to BB#" << NextBB->getNumber() << "\n"); @@ -1519,23 +1613,20 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) { BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode())) .addMBB(NextBB).addImm(CC).addReg(CCReg); Br.MI = &MBB->back(); - BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back()); + BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); if (isThumb) BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB) .addImm(ARMCC::AL).addReg(0); else BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB); - BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back()); + BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr); ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr)); // Remove the old conditional branch. It may or may not still be in MBB. - BBSizes[MI->getParent()->getNumber()] -= TII->GetInstSizeInBytes(MI); + BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); MI->eraseFromParent(); - - // The net size change is an addition of one unconditional branch. - int delta = TII->GetInstSizeInBytes(&MBB->back()); - AdjustBBOffsetsAfter(MBB, delta); + AdjustBBOffsetsAfter(MBB); return true; } @@ -1561,7 +1652,7 @@ bool ARMConstantIslands::UndoLRSpillRestore() { return MadeChange; } -bool ARMConstantIslands::OptimizeThumb2Instructions(MachineFunction &MF) { +bool ARMConstantIslands::OptimizeThumb2Instructions() { bool MadeChange = false; // Shrink ADR and LDR from constantpool. 
@@ -1598,19 +1689,19 @@ bool ARMConstantIslands::OptimizeThumb2Instructions(MachineFunction &MF) { if (CPEIsInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) { U.MI->setDesc(TII->get(NewOpc)); MachineBasicBlock *MBB = U.MI->getParent(); - BBSizes[MBB->getNumber()] -= 2; - AdjustBBOffsetsAfter(MBB, -2); + BBInfo[MBB->getNumber()].Size -= 2; + AdjustBBOffsetsAfter(MBB); ++NumT2CPShrunk; MadeChange = true; } } - MadeChange |= OptimizeThumb2Branches(MF); - MadeChange |= OptimizeThumb2JumpTables(MF); + MadeChange |= OptimizeThumb2Branches(); + MadeChange |= OptimizeThumb2JumpTables(); return MadeChange; } -bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) { +bool ARMConstantIslands::OptimizeThumb2Branches() { bool MadeChange = false; for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) { @@ -1639,8 +1730,8 @@ bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) { if (BBIsInRange(Br.MI, DestBB, MaxOffs)) { Br.MI->setDesc(TII->get(NewOpc)); MachineBasicBlock *MBB = Br.MI->getParent(); - BBSizes[MBB->getNumber()] -= 2; - AdjustBBOffsetsAfter(MBB, -2); + BBInfo[MBB->getNumber()].Size -= 2; + AdjustBBOffsetsAfter(MBB); ++NumT2BrShrunk; MadeChange = true; } @@ -1663,7 +1754,7 @@ bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) { // Check if the distance is within 126. Subtract starting offset by 2 // because the cmp will be eliminated. unsigned BrOffset = GetOffsetOf(Br.MI) + 4 - 2; - unsigned DestOffset = BBOffsets[DestBB->getNumber()]; + unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset; if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) { MachineBasicBlock::iterator CmpMI = Br.MI; if (CmpMI != Br.MI->getParent()->begin()) { @@ -1681,8 +1772,8 @@ bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) { CmpMI->eraseFromParent(); Br.MI->eraseFromParent(); Br.MI = NewBR; - BBSizes[MBB->getNumber()] -= 2; - AdjustBBOffsetsAfter(MBB, -2); + BBInfo[MBB->getNumber()].Size -= 2; + AdjustBBOffsetsAfter(MBB); ++NumCBZ; MadeChange = true; } @@ -1696,12 +1787,12 @@ bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) { /// OptimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller /// jumptables when it's possible. -bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { +bool ARMConstantIslands::OptimizeThumb2JumpTables() { bool MadeChange = false; // FIXME: After the tables are shrunk, can we get rid some of the // constantpool tables? - MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); + MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); if (MJTI == 0) return false; const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); @@ -1709,7 +1800,7 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { MachineInstr *MI = T2JumpTables[i]; const MCInstrDesc &MCID = MI->getDesc(); unsigned NumOps = MCID.getNumOperands(); - unsigned JTOpIdx = NumOps - (MCID.isPredicable() ? 3 : 2); + unsigned JTOpIdx = NumOps - (MI->isPredicable() ? 3 : 2); MachineOperand JTOP = MI->getOperand(JTOpIdx); unsigned JTI = JTOP.getIndex(); assert(JTI < JT.size()); @@ -1720,7 +1811,7 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) { MachineBasicBlock *MBB = JTBBs[j]; - unsigned DstOffset = BBOffsets[MBB->getNumber()]; + unsigned DstOffset = BBInfo[MBB->getNumber()].Offset; // Negative offset is not ok. 
FIXME: We should change BB layout to make // sure all the branches are forward. if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2) @@ -1808,8 +1899,8 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { MI->eraseFromParent(); int delta = OrigSize - NewSize; - BBSizes[MBB->getNumber()] -= delta; - AdjustBBOffsetsAfter(MBB, -delta); + BBInfo[MBB->getNumber()].Size -= delta; + AdjustBBOffsetsAfter(MBB); ++NumTBs; MadeChange = true; @@ -1821,10 +1912,10 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { /// ReorderThumb2JumpTables - Adjust the function's block layout to ensure that /// jump tables always branch forwards, since that's what tbb and tbh need. -bool ARMConstantIslands::ReorderThumb2JumpTables(MachineFunction &MF) { +bool ARMConstantIslands::ReorderThumb2JumpTables() { bool MadeChange = false; - MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); + MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); if (MJTI == 0) return false; const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); @@ -1832,7 +1923,7 @@ bool ARMConstantIslands::ReorderThumb2JumpTables(MachineFunction &MF) { MachineInstr *MI = T2JumpTables[i]; const MCInstrDesc &MCID = MI->getDesc(); unsigned NumOps = MCID.getNumOperands(); - unsigned JTOpIdx = NumOps - (MCID.isPredicable() ? 3 : 2); + unsigned JTOpIdx = NumOps - (MI->isPredicable() ? 3 : 2); MachineOperand JTOP = MI->getOperand(JTOpIdx); unsigned JTI = JTOP.getIndex(); assert(JTI < JT.size()); @@ -1864,8 +1955,6 @@ bool ARMConstantIslands::ReorderThumb2JumpTables(MachineFunction &MF) { MachineBasicBlock *ARMConstantIslands:: AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { - MachineFunction &MF = *BB->getParent(); - // If the destination block is terminated by an unconditional branch, // try to move it; otherwise, create a new block following the jump // table that branches back to the actual target. This is a very simple @@ -1882,22 +1971,22 @@ AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) // If the block ends in an unconditional branch, move it. The prior block // has to have an analyzable terminator for us to move this one. Be paranoid // and make sure we're not trying to move the entry block of the function. - if (!B && Cond.empty() && BB != MF.begin() && + if (!B && Cond.empty() && BB != MF->begin() && !TII->AnalyzeBranch(*OldPrior, TBB, FBB, CondPrior)) { BB->moveAfter(JTBB); OldPrior->updateTerminator(); BB->updateTerminator(); // Update numbering to account for the block being moved. - MF.RenumberBlocks(); + MF->RenumberBlocks(); ++NumJTMoved; return NULL; } // Create a new MBB for the code after the jump BB. MachineBasicBlock *NewBB = - MF.CreateMachineBasicBlock(JTBB->getBasicBlock()); + MF->CreateMachineBasicBlock(JTBB->getBasicBlock()); MachineFunction::iterator MBBI = JTBB; ++MBBI; - MF.insert(MBBI, NewBB); + MF->insert(MBBI, NewBB); // Add an unconditional branch from NewBB to BB. // There doesn't seem to be meaningful DebugInfo available; this doesn't @@ -1907,7 +1996,7 @@ AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) .addImm(ARMCC::AL).addReg(0); // Update internal data structures to account for the newly inserted MBB. - MF.RenumberBlocks(NewBB); + MF->RenumberBlocks(NewBB); // Update the CFG. 
NewBB->addSuccessor(BB); diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index fc464ea..01d772d 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -61,7 +61,7 @@ namespace { void ExpandVST(MachineBasicBlock::iterator &MBBI); void ExpandLaneOp(MachineBasicBlock::iterator &MBBI); void ExpandVTBL(MachineBasicBlock::iterator &MBBI, - unsigned Opc, bool IsExt, unsigned NumRegs); + unsigned Opc, bool IsExt); void ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI); }; @@ -129,12 +129,15 @@ namespace { } static const NEONLdStTableEntry NEONLdStTable[] = { -{ ARM::VLD1DUPq16Pseudo, ARM::VLD1DUPq16, true, false, false, SingleSpc, 2, 4,true}, -{ ARM::VLD1DUPq16Pseudo_UPD, ARM::VLD1DUPq16_UPD, true, true, true, SingleSpc, 2, 4,true}, -{ ARM::VLD1DUPq32Pseudo, ARM::VLD1DUPq32, true, false, false, SingleSpc, 2, 2,true}, -{ ARM::VLD1DUPq32Pseudo_UPD, ARM::VLD1DUPq32_UPD, true, true, true, SingleSpc, 2, 2,true}, -{ ARM::VLD1DUPq8Pseudo, ARM::VLD1DUPq8, true, false, false, SingleSpc, 2, 8,true}, -{ ARM::VLD1DUPq8Pseudo_UPD, ARM::VLD1DUPq8_UPD, true, true, true, SingleSpc, 2, 8,true}, +{ ARM::VLD1DUPq16Pseudo, ARM::VLD1DUPq16, true, false, false, SingleSpc, 2, 4,false}, +{ ARM::VLD1DUPq16PseudoWB_fixed, ARM::VLD1DUPq16wb_fixed, true, true, true, SingleSpc, 2, 4,false}, +{ ARM::VLD1DUPq16PseudoWB_register, ARM::VLD1DUPq16wb_register, true, true, true, SingleSpc, 2, 4,false}, +{ ARM::VLD1DUPq32Pseudo, ARM::VLD1DUPq32, true, false, false, SingleSpc, 2, 2,false}, +{ ARM::VLD1DUPq32PseudoWB_fixed, ARM::VLD1DUPq32wb_fixed, true, true, false, SingleSpc, 2, 2,false}, +{ ARM::VLD1DUPq32PseudoWB_register, ARM::VLD1DUPq32wb_register, true, true, true, SingleSpc, 2, 2,false}, +{ ARM::VLD1DUPq8Pseudo, ARM::VLD1DUPq8, true, false, false, SingleSpc, 2, 8,false}, +{ ARM::VLD1DUPq8PseudoWB_fixed, ARM::VLD1DUPq8wb_fixed, true, true, false, SingleSpc, 2, 8,false}, +{ ARM::VLD1DUPq8PseudoWB_register, ARM::VLD1DUPq8wb_register, true, true, true, SingleSpc, 2, 8,false}, { ARM::VLD1LNq16Pseudo, ARM::VLD1LNd16, true, false, false, EvenDblSpc, 1, 4 ,true}, { ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, true, EvenDblSpc, 1, 4 ,true}, @@ -177,18 +180,24 @@ static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VLD2LNq32Pseudo_UPD, ARM::VLD2LNq32_UPD, true, true, true, EvenDblSpc, 2, 2 ,true}, { ARM::VLD2d16Pseudo, ARM::VLD2d16, true, false, false, SingleSpc, 2, 4 ,false}, -{ ARM::VLD2d16Pseudo_UPD, ARM::VLD2d16_UPD, true, true, true, SingleSpc, 2, 4 ,false}, +{ ARM::VLD2d16PseudoWB_fixed, ARM::VLD2d16wb_fixed, true, true, false, SingleSpc, 2, 4 ,false}, +{ ARM::VLD2d16PseudoWB_register, ARM::VLD2d16wb_register, true, true, true, SingleSpc, 2, 4 ,false}, { ARM::VLD2d32Pseudo, ARM::VLD2d32, true, false, false, SingleSpc, 2, 2 ,false}, -{ ARM::VLD2d32Pseudo_UPD, ARM::VLD2d32_UPD, true, true, true, SingleSpc, 2, 2 ,false}, +{ ARM::VLD2d32PseudoWB_fixed, ARM::VLD2d32wb_fixed, true, true, false, SingleSpc, 2, 2 ,false}, +{ ARM::VLD2d32PseudoWB_register, ARM::VLD2d32wb_register, true, true, true, SingleSpc, 2, 2 ,false}, { ARM::VLD2d8Pseudo, ARM::VLD2d8, true, false, false, SingleSpc, 2, 8 ,false}, -{ ARM::VLD2d8Pseudo_UPD, ARM::VLD2d8_UPD, true, true, true, SingleSpc, 2, 8 ,false}, +{ ARM::VLD2d8PseudoWB_fixed, ARM::VLD2d8wb_fixed, true, true, false, SingleSpc, 2, 8 ,false}, +{ ARM::VLD2d8PseudoWB_register, ARM::VLD2d8wb_register, true, true, true, SingleSpc, 2, 8 ,false}, { ARM::VLD2q16Pseudo, 
ARM::VLD2q16, true, false, false, SingleSpc, 4, 4 ,false}, -{ ARM::VLD2q16Pseudo_UPD, ARM::VLD2q16_UPD, true, true, true, SingleSpc, 4, 4 ,false}, +{ ARM::VLD2q16PseudoWB_fixed, ARM::VLD2q16wb_fixed, true, true, false, SingleSpc, 4, 4 ,false}, +{ ARM::VLD2q16PseudoWB_register, ARM::VLD2q16wb_register, true, true, true, SingleSpc, 4, 4 ,false}, { ARM::VLD2q32Pseudo, ARM::VLD2q32, true, false, false, SingleSpc, 4, 2 ,false}, -{ ARM::VLD2q32Pseudo_UPD, ARM::VLD2q32_UPD, true, true, true, SingleSpc, 4, 2 ,false}, +{ ARM::VLD2q32PseudoWB_fixed, ARM::VLD2q32wb_fixed, true, true, false, SingleSpc, 4, 2 ,false}, +{ ARM::VLD2q32PseudoWB_register, ARM::VLD2q32wb_register, true, true, true, SingleSpc, 4, 2 ,false}, { ARM::VLD2q8Pseudo, ARM::VLD2q8, true, false, false, SingleSpc, 4, 8 ,false}, -{ ARM::VLD2q8Pseudo_UPD, ARM::VLD2q8_UPD, true, true, true, SingleSpc, 4, 8 ,false}, +{ ARM::VLD2q8PseudoWB_fixed, ARM::VLD2q8wb_fixed, true, true, false, SingleSpc, 4, 8 ,false}, +{ ARM::VLD2q8PseudoWB_register, ARM::VLD2q8wb_register, true, true, true, SingleSpc, 4, 8 ,false}, { ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd16, true, false, false, SingleSpc, 3, 4,true}, { ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true, true, SingleSpc, 3, 4,true}, @@ -267,10 +276,12 @@ static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VST1LNq8Pseudo, ARM::VST1LNd8, false, false, false, EvenDblSpc, 1, 8 ,true}, { ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, true, EvenDblSpc, 1, 8 ,true}, -{ ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, false, SingleSpc, 4, 1 ,true}, -{ ARM::VST1d64QPseudo_UPD, ARM::VST1d64Q_UPD, false, true, true, SingleSpc, 4, 1 ,true}, -{ ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, false, SingleSpc, 3, 1 ,true}, -{ ARM::VST1d64TPseudo_UPD, ARM::VST1d64T_UPD, false, true, true, SingleSpc, 3, 1 ,true}, +{ ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, false, SingleSpc, 4, 1 ,false}, +{ ARM::VST1d64QPseudoWB_fixed, ARM::VST1d64Qwb_fixed, false, true, false, SingleSpc, 4, 1 ,false}, +{ ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true, SingleSpc, 4, 1 ,false}, +{ ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, false, SingleSpc, 3, 1 ,false}, +{ ARM::VST1d64TPseudoWB_fixed, ARM::VST1d64Twb_fixed, false, true, false, SingleSpc, 3, 1 ,false}, +{ ARM::VST1d64TPseudoWB_register, ARM::VST1d64Twb_register, false, true, true, SingleSpc, 3, 1 ,false}, { ARM::VST1q16Pseudo, ARM::VST1q16, false, false, false, SingleSpc, 2, 4 ,false}, { ARM::VST1q16PseudoWB_fixed, ARM::VST1q16wb_fixed, false, true, false, SingleSpc, 2, 4 ,false}, @@ -296,19 +307,25 @@ static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VST2LNq32Pseudo, ARM::VST2LNq32, false, false, false, EvenDblSpc, 2, 2,true}, { ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true, true, EvenDblSpc, 2, 2,true}, -{ ARM::VST2d16Pseudo, ARM::VST2d16, false, false, false, SingleSpc, 2, 4 ,true}, -{ ARM::VST2d16Pseudo_UPD, ARM::VST2d16_UPD, false, true, true, SingleSpc, 2, 4 ,true}, -{ ARM::VST2d32Pseudo, ARM::VST2d32, false, false, false, SingleSpc, 2, 2 ,true}, -{ ARM::VST2d32Pseudo_UPD, ARM::VST2d32_UPD, false, true, true, SingleSpc, 2, 2 ,true}, -{ ARM::VST2d8Pseudo, ARM::VST2d8, false, false, false, SingleSpc, 2, 8 ,true}, -{ ARM::VST2d8Pseudo_UPD, ARM::VST2d8_UPD, false, true, true, SingleSpc, 2, 8 ,true}, - -{ ARM::VST2q16Pseudo, ARM::VST2q16, false, false, false, SingleSpc, 4, 4 ,true}, -{ ARM::VST2q16Pseudo_UPD, ARM::VST2q16_UPD, false, true, true, SingleSpc, 4, 4 ,true}, -{ 
ARM::VST2q32Pseudo, ARM::VST2q32, false, false, false, SingleSpc, 4, 2 ,true}, -{ ARM::VST2q32Pseudo_UPD, ARM::VST2q32_UPD, false, true, true, SingleSpc, 4, 2 ,true}, -{ ARM::VST2q8Pseudo, ARM::VST2q8, false, false, false, SingleSpc, 4, 8 ,true}, -{ ARM::VST2q8Pseudo_UPD, ARM::VST2q8_UPD, false, true, true, SingleSpc, 4, 8 ,true}, +{ ARM::VST2d16Pseudo, ARM::VST2d16, false, false, false, SingleSpc, 2, 4 ,false}, +{ ARM::VST2d16PseudoWB_fixed, ARM::VST2d16wb_fixed, false, true, false, SingleSpc, 2, 4 ,false}, +{ ARM::VST2d16PseudoWB_register, ARM::VST2d16wb_register, false, true, true, SingleSpc, 2, 4 ,false}, +{ ARM::VST2d32Pseudo, ARM::VST2d32, false, false, false, SingleSpc, 2, 2 ,false}, +{ ARM::VST2d32PseudoWB_fixed, ARM::VST2d32wb_fixed, false, true, true, SingleSpc, 2, 2 ,false}, +{ ARM::VST2d32PseudoWB_register, ARM::VST2d32wb_register, false, true, true, SingleSpc, 2, 2 ,false}, +{ ARM::VST2d8Pseudo, ARM::VST2d8, false, false, false, SingleSpc, 2, 8 ,false}, +{ ARM::VST2d8PseudoWB_fixed, ARM::VST2d8wb_fixed, false, true, false, SingleSpc, 2, 8 ,false}, +{ ARM::VST2d8PseudoWB_register, ARM::VST2d8wb_register, false, true, true, SingleSpc, 2, 8 ,false}, + +{ ARM::VST2q16Pseudo, ARM::VST2q16, false, false, false, SingleSpc, 4, 4 ,false}, +{ ARM::VST2q16PseudoWB_fixed, ARM::VST2q16wb_fixed, false, true, false, SingleSpc, 4, 4 ,false}, +{ ARM::VST2q16PseudoWB_register, ARM::VST2q16wb_register, false, true, true, SingleSpc, 4, 4 ,false}, +{ ARM::VST2q32Pseudo, ARM::VST2q32, false, false, false, SingleSpc, 4, 2 ,false}, +{ ARM::VST2q32PseudoWB_fixed, ARM::VST2q32wb_fixed, false, true, false, SingleSpc, 4, 2 ,false}, +{ ARM::VST2q32PseudoWB_register, ARM::VST2q32wb_register, false, true, true, SingleSpc, 4, 2 ,false}, +{ ARM::VST2q8Pseudo, ARM::VST2q8, false, false, false, SingleSpc, 4, 8 ,false}, +{ ARM::VST2q8PseudoWB_fixed, ARM::VST2q8wb_fixed, false, true, false, SingleSpc, 4, 8 ,false}, +{ ARM::VST2q8PseudoWB_register, ARM::VST2q8wb_register, false, true, true, SingleSpc, 4, 8 ,false}, { ARM::VST3LNd16Pseudo, ARM::VST3LNd16, false, false, false, SingleSpc, 3, 4 ,true}, { ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true, true, SingleSpc, 3, 4 ,true}, @@ -620,7 +637,7 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { /// ExpandVTBL - Translate VTBL and VTBX pseudo instructions with Q or QQ /// register operands to real instructions with D register operands. void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, - unsigned Opc, bool IsExt, unsigned NumRegs) { + unsigned Opc, bool IsExt) { MachineInstr &MI = *MBBI; MachineBasicBlock &MBB = *MI.getParent(); @@ -636,11 +653,7 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3); - MIB.addReg(D0).addReg(D1); - if (NumRegs > 2) - MIB.addReg(D2); - if (NumRegs > 3) - MIB.addReg(D3); + MIB.addReg(D0); // Copy the other source register operand. 
MIB.addOperand(MI.getOperand(OpIdx++)); @@ -1090,12 +1103,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VLD2q8Pseudo: case ARM::VLD2q16Pseudo: case ARM::VLD2q32Pseudo: - case ARM::VLD2d8Pseudo_UPD: - case ARM::VLD2d16Pseudo_UPD: - case ARM::VLD2d32Pseudo_UPD: - case ARM::VLD2q8Pseudo_UPD: - case ARM::VLD2q16Pseudo_UPD: - case ARM::VLD2q32Pseudo_UPD: + case ARM::VLD2d8PseudoWB_fixed: + case ARM::VLD2d16PseudoWB_fixed: + case ARM::VLD2d32PseudoWB_fixed: + case ARM::VLD2q8PseudoWB_fixed: + case ARM::VLD2q16PseudoWB_fixed: + case ARM::VLD2q32PseudoWB_fixed: + case ARM::VLD2d8PseudoWB_register: + case ARM::VLD2d16PseudoWB_register: + case ARM::VLD2d32PseudoWB_register: + case ARM::VLD2q8PseudoWB_register: + case ARM::VLD2q16PseudoWB_register: + case ARM::VLD2q32PseudoWB_register: case ARM::VLD3d8Pseudo: case ARM::VLD3d16Pseudo: case ARM::VLD3d32Pseudo: @@ -1131,9 +1150,12 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VLD1DUPq8Pseudo: case ARM::VLD1DUPq16Pseudo: case ARM::VLD1DUPq32Pseudo: - case ARM::VLD1DUPq8Pseudo_UPD: - case ARM::VLD1DUPq16Pseudo_UPD: - case ARM::VLD1DUPq32Pseudo_UPD: + case ARM::VLD1DUPq8PseudoWB_fixed: + case ARM::VLD1DUPq16PseudoWB_fixed: + case ARM::VLD1DUPq32PseudoWB_fixed: + case ARM::VLD1DUPq8PseudoWB_register: + case ARM::VLD1DUPq16PseudoWB_register: + case ARM::VLD1DUPq32PseudoWB_register: case ARM::VLD2DUPd8Pseudo: case ARM::VLD2DUPd16Pseudo: case ARM::VLD2DUPd32Pseudo: @@ -1173,12 +1195,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VST2q8Pseudo: case ARM::VST2q16Pseudo: case ARM::VST2q32Pseudo: - case ARM::VST2d8Pseudo_UPD: - case ARM::VST2d16Pseudo_UPD: - case ARM::VST2d32Pseudo_UPD: - case ARM::VST2q8Pseudo_UPD: - case ARM::VST2q16Pseudo_UPD: - case ARM::VST2q32Pseudo_UPD: + case ARM::VST2d8PseudoWB_fixed: + case ARM::VST2d16PseudoWB_fixed: + case ARM::VST2d32PseudoWB_fixed: + case ARM::VST2q8PseudoWB_fixed: + case ARM::VST2q16PseudoWB_fixed: + case ARM::VST2q32PseudoWB_fixed: + case ARM::VST2d8PseudoWB_register: + case ARM::VST2d16PseudoWB_register: + case ARM::VST2d32PseudoWB_register: + case ARM::VST2q8PseudoWB_register: + case ARM::VST2q16PseudoWB_register: + case ARM::VST2q32PseudoWB_register: case ARM::VST3d8Pseudo: case ARM::VST3d16Pseudo: case ARM::VST3d32Pseudo: @@ -1186,7 +1214,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VST3d8Pseudo_UPD: case ARM::VST3d16Pseudo_UPD: case ARM::VST3d32Pseudo_UPD: - case ARM::VST1d64TPseudo_UPD: + case ARM::VST1d64TPseudoWB_fixed: + case ARM::VST1d64TPseudoWB_register: case ARM::VST3q8Pseudo_UPD: case ARM::VST3q16Pseudo_UPD: case ARM::VST3q32Pseudo_UPD: @@ -1203,7 +1232,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VST4d8Pseudo_UPD: case ARM::VST4d16Pseudo_UPD: case ARM::VST4d32Pseudo_UPD: - case ARM::VST1d64QPseudo_UPD: + case ARM::VST1d64QPseudoWB_fixed: + case ARM::VST1d64QPseudoWB_register: case ARM::VST4q8Pseudo_UPD: case ARM::VST4q16Pseudo_UPD: case ARM::VST4q32Pseudo_UPD: @@ -1291,12 +1321,12 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, ExpandLaneOp(MBBI); return true; - case ARM::VTBL2Pseudo: ExpandVTBL(MBBI, ARM::VTBL2, false, 2); return true; - case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false, 3); return true; - case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false, 4); return true; - case ARM::VTBX2Pseudo: ExpandVTBL(MBBI, ARM::VTBX2, true, 2); return true; - case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true, 3); return true; - case ARM::VTBX4Pseudo: 
ExpandVTBL(MBBI, ARM::VTBX4, true, 4); return true; + case ARM::VTBL2Pseudo: ExpandVTBL(MBBI, ARM::VTBL2, false); return true; + case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true; + case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true; + case ARM::VTBX2Pseudo: ExpandVTBL(MBBI, ARM::VTBX2, true); return true; + case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true; + case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true; } return false; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 9bae422..a98dfc3 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -178,10 +178,12 @@ class ARMFastISel : public FastISel { bool isLoadTypeLegal(Type *Ty, MVT &VT); bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, bool isZExt); - bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, bool isZExt, - bool allocReg); + bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, + unsigned Alignment = 0, bool isZExt = true, + bool allocReg = true); - bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr); + bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr, + unsigned Alignment = 0); bool ARMComputeAddress(const Value *Obj, Address &Addr); void ARMSimplifyAddress(Address &Addr, EVT VT, bool useAM3); bool ARMIsMemCpySmall(uint64_t Len); @@ -227,8 +229,7 @@ class ARMFastISel : public FastISel { // we don't care about implicit defs here, just places we'll need to add a // default CCReg argument. Sets CPSR if we're setting CPSR instead of CCR. bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { - const MCInstrDesc &MCID = MI->getDesc(); - if (!MCID.hasOptionalDef()) + if (!MI->hasOptionalDef()) return false; // Look to see if our OptionalDef is defining CPSR or CCR. @@ -702,7 +703,7 @@ unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) { TargetRegisterClass* RC = TLI.getRegClassFor(VT); unsigned ResultReg = createResultReg(RC); unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri; - AddOptionalDefs(BuildMI(*FuncInfo.MBB, *FuncInfo.InsertPt, DL, + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg) .addFrameIndex(SI->second) .addImm(0)); @@ -898,7 +899,7 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT, bool useAM3) { ARM::GPRRegisterClass; unsigned ResultReg = createResultReg(RC); unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri; - AddOptionalDefs(BuildMI(*FuncInfo.MBB, *FuncInfo.InsertPt, DL, + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg) .addFrameIndex(Addr.Base.FI) .addImm(0)); @@ -937,7 +938,8 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, // Now add the rest of the operands. MIB.addFrameIndex(FI); - // ARM halfword load/stores and signed byte loads need an additional operand. + // ARM halfword load/stores and signed byte loads need an additional + // operand. if (useAM3) { signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset; MIB.addReg(0); @@ -950,7 +952,8 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, // Now add the rest of the operands. MIB.addReg(Addr.Base.Reg); - // ARM halfword load/stores and signed byte loads need an additional operand. + // ARM halfword load/stores and signed byte loads need an additional + // operand. if (useAM3) { signed Imm = (Addr.Offset < 0) ? 
(0x100 | -Addr.Offset) : Addr.Offset; MIB.addReg(0); @@ -963,10 +966,11 @@ } bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, - bool isZExt = true, bool allocReg = true) { + unsigned Alignment, bool isZExt, bool allocReg) { assert(VT.isSimple() && "Non-simple types are invalid here!"); unsigned Opc; bool useAM3 = false; + bool needVMOV = false; TargetRegisterClass *RC; switch (VT.getSimpleVT().SimpleTy) { // This is mostly going to be Neon/vector support. @@ -1012,10 +1016,25 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, RC = ARM::GPRRegisterClass; break; case MVT::f32: - Opc = ARM::VLDRS; - RC = TLI.getRegClassFor(VT); + if (!Subtarget->hasVFP2()) return false; + // Unaligned loads need special handling. Floats require word-alignment. + if (Alignment && Alignment < 4) { + needVMOV = true; + VT = MVT::i32; + Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12; + RC = ARM::GPRRegisterClass; + } else { + Opc = ARM::VLDRS; + RC = TLI.getRegClassFor(VT); + } break; case MVT::f64: + if (!Subtarget->hasVFP2()) return false; + // FIXME: Unaligned loads need special handling. Doublewords require + // word-alignment. + if (Alignment && Alignment < 4) + return false; + Opc = ARM::VLDRD; RC = TLI.getRegClassFor(VT); break; @@ -1030,6 +1049,16 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg); AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad, useAM3); + + // If we had an unaligned load of a float we've converted it to a regular + // load. Now we must move from the GPR to the FP register. + if (needVMOV) { + unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VMOVSR), MoveReg) + .addReg(ResultReg)); + ResultReg = MoveReg; + } return true; } @@ -1048,12 +1077,14 @@ bool ARMFastISel::SelectLoad(const Instruction *I) { if (!ARMComputeAddress(I->getOperand(0), Addr)) return false; unsigned ResultReg; - if (!ARMEmitLoad(VT, ResultReg, Addr)) return false; + if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment())) + return false; UpdateValueMap(I, ResultReg); return true; } -bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) { +bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr, + unsigned Alignment) { unsigned StrOpc; bool useAM3 = false; switch (VT.getSimpleVT().SimpleTy) { @@ -1101,10 +1132,26 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) { break; case MVT::f32: if (!Subtarget->hasVFP2()) return false; - StrOpc = ARM::VSTRS; + // Unaligned stores need special handling. Floats require word-alignment. + if (Alignment && Alignment < 4) { + unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VMOVRS), MoveReg) + .addReg(SrcReg)); + SrcReg = MoveReg; + VT = MVT::i32; + StrOpc = isThumb2 ? ARM::t2STRi12 : ARM::STRi12; + } else { + StrOpc = ARM::VSTRS; + } break; case MVT::f64: if (!Subtarget->hasVFP2()) return false; + // FIXME: Unaligned stores need special handling. Doublewords require + // word-alignment.
+ if (Alignment && Alignment < 4) + return false; + StrOpc = ARM::VSTRD; break; } @@ -1141,7 +1188,8 @@ bool ARMFastISel::SelectStore(const Instruction *I) { if (!ARMComputeAddress(I->getOperand(1), Addr)) return false; - if (!ARMEmitStore(VT, SrcReg, Addr)) return false; + if (!ARMEmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment())) + return false; return true; } @@ -1360,7 +1408,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, unsigned SrcReg1 = getRegForValue(Src1Value); if (SrcReg1 == 0) return false; - unsigned SrcReg2; + unsigned SrcReg2 = 0; if (!UseImm) { SrcReg2 = getRegForValue(Src2Value); if (SrcReg2 == 0) return false; @@ -1577,7 +1625,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) { (ARM_AM::getSOImmVal(Imm) != -1); } - unsigned Op2Reg; + unsigned Op2Reg = 0; if (!UseImm) { Op2Reg = getRegForValue(I->getOperand(2)); if (Op2Reg == 0) return false; @@ -1716,7 +1764,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) { // Use target triple & subtarget features to do actual dispatch. if (Subtarget->isAAPCS_ABI()) { if (Subtarget->hasVFP2() && - FloatABIType == FloatABI::Hard) + TM.Options.FloatABIType == FloatABI::Hard) return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); else return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); @@ -1765,21 +1813,23 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, switch (VA.getLocInfo()) { case CCValAssign::Full: break; case CCValAssign::SExt: { - EVT DestVT = VA.getLocVT(); + MVT DestVT = VA.getLocVT(); unsigned ResultReg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/false); assert (ResultReg != 0 && "Failed to emit a sext"); Arg = ResultReg; + ArgVT = DestVT; break; } case CCValAssign::AExt: // Intentional fall-through. Handle AExt and ZExt. case CCValAssign::ZExt: { - EVT DestVT = VA.getLocVT(); + MVT DestVT = VA.getLocVT(); unsigned ResultReg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true); assert (ResultReg != 0 && "Failed to emit a sext"); Arg = ResultReg; + ArgVT = DestVT; break; } case CCValAssign::BCvt: { @@ -2456,7 +2506,7 @@ bool ARMFastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo, if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false; unsigned ResultReg = MI->getOperand(0).getReg(); - if (!ARMEmitLoad(VT, ResultReg, Addr, isZExt, false)) + if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false)) return false; MI->eraseFromParent(); return true; diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 2d1de6f..06944b1 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -37,7 +37,8 @@ bool ARMFrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); // Always eliminate non-leaf frame pointers. 
- return ((DisableFramePointerElim(MF) && MFI->hasCalls()) || + return ((MF.getTarget().Options.DisableFramePointerElim(MF) && + MFI->hasCalls()) || RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken()); @@ -309,8 +310,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { void ARMFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->getDesc().isReturn() && - "Can only insert epilog into returning blocks"); + assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc dl = MBBI->getDebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp index 787f6a2..a5fd15b 100644 --- a/lib/Target/ARM/ARMHazardRecognizer.cpp +++ b/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -21,7 +21,7 @@ static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI, // FIXME: Detect integer instructions properly. const MCInstrDesc &MCID = MI->getDesc(); unsigned Domain = MCID.TSFlags & ARMII::DomainMask; - if (MCID.mayStore()) + if (MI->mayStore()) return false; unsigned Opcode = MCID.getOpcode(); if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) @@ -38,9 +38,6 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { MachineInstr *MI = SU->getInstr(); if (!MI->isDebugValue()) { - if (ITBlockSize && MI != ITBlockMIs[ITBlockSize-1]) - return Hazard; - // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following // a VMLA / VMLS will cause 4 cycle stall. const MCInstrDesc &MCID = MI->getDesc(); @@ -48,9 +45,9 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { MachineInstr *DefMI = LastMI; const MCInstrDesc &LastMCID = LastMI->getDesc(); // Skip over one non-VFP / NEON instruction. - if (!LastMCID.isBarrier() && + if (!LastMI->isBarrier() && // On A9, AGU and NEON/FPU are muxed. - !(STI.isCortexA9() && (LastMCID.mayLoad() || LastMCID.mayStore())) && + !(STI.isCortexA9() && (LastMI->mayLoad() || LastMI->mayStore())) && (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) { MachineBasicBlock::iterator I = LastMI; if (I != LastMI->getParent()->begin()) { @@ -76,30 +73,11 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { void ARMHazardRecognizer::Reset() { LastMI = 0; FpMLxStalls = 0; - ITBlockSize = 0; ScoreboardHazardRecognizer::Reset(); } void ARMHazardRecognizer::EmitInstruction(SUnit *SU) { MachineInstr *MI = SU->getInstr(); - unsigned Opcode = MI->getOpcode(); - if (ITBlockSize) { - --ITBlockSize; - } else if (Opcode == ARM::t2IT) { - unsigned Mask = MI->getOperand(1).getImm(); - unsigned NumTZ = CountTrailingZeros_32(Mask); - assert(NumTZ <= 3 && "Invalid IT mask!"); - ITBlockSize = 4 - NumTZ; - MachineBasicBlock::iterator I = MI; - for (unsigned i = 0; i < ITBlockSize; ++i) { - // Advance to the next instruction, skipping any dbg_value instructions. 
- do { - ++I; - } while (I->isDebugValue()); - ITBlockMIs[ITBlockSize-1-i] = &*I; - } - } - if (!MI->isDebugValue()) { LastMI = MI; FpMLxStalls = 0; diff --git a/lib/Target/ARM/ARMHazardRecognizer.h b/lib/Target/ARM/ARMHazardRecognizer.h index 2bc218d..98bfc4c 100644 --- a/lib/Target/ARM/ARMHazardRecognizer.h +++ b/lib/Target/ARM/ARMHazardRecognizer.h @@ -23,6 +23,10 @@ class ARMBaseRegisterInfo; class ARMSubtarget; class MachineInstr; +/// ARMHazardRecognizer handles special constraints that are not expressed in +/// the scheduling itinerary. This is only used during postRA scheduling. The +/// ARM preRA scheduler uses an unspecialized instance of the +/// ScoreboardHazardRecognizer. class ARMHazardRecognizer : public ScoreboardHazardRecognizer { const ARMBaseInstrInfo &TII; const ARMBaseRegisterInfo &TRI; @@ -30,8 +34,6 @@ class ARMHazardRecognizer : public ScoreboardHazardRecognizer { MachineInstr *LastMI; unsigned FpMLxStalls; - unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled. - MachineInstr *ITBlockMIs[4]; public: ARMHazardRecognizer(const InstrItineraryData *ItinData, @@ -40,7 +42,7 @@ public: const ARMSubtarget &sti, const ScheduleDAG *DAG) : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), TII(tii), - TRI(tri), STI(sti), LastMI(0), ITBlockSize(0) {} + TRI(tri), STI(sti), LastMI(0) {} virtual HazardType getHazardType(SUnit *SU, int Stalls); virtual void Reset(); diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index bc8588f..7473141 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1579,6 +1579,22 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { case ARM::VST1q16PseudoWB_fixed: return ARM::VST1q16PseudoWB_register; case ARM::VST1q32PseudoWB_fixed: return ARM::VST1q32PseudoWB_register; case ARM::VST1q64PseudoWB_fixed: return ARM::VST1q64PseudoWB_register; + case ARM::VST1d64TPseudoWB_fixed: return ARM::VST1d64TPseudoWB_register; + case ARM::VST1d64QPseudoWB_fixed: return ARM::VST1d64QPseudoWB_register; + + case ARM::VLD2d8PseudoWB_fixed: return ARM::VLD2d8PseudoWB_register; + case ARM::VLD2d16PseudoWB_fixed: return ARM::VLD2d16PseudoWB_register; + case ARM::VLD2d32PseudoWB_fixed: return ARM::VLD2d32PseudoWB_register; + case ARM::VLD2q8PseudoWB_fixed: return ARM::VLD2q8PseudoWB_register; + case ARM::VLD2q16PseudoWB_fixed: return ARM::VLD2q16PseudoWB_register; + case ARM::VLD2q32PseudoWB_fixed: return ARM::VLD2q32PseudoWB_register; + + case ARM::VST2d8PseudoWB_fixed: return ARM::VST2d8PseudoWB_register; + case ARM::VST2d16PseudoWB_fixed: return ARM::VST2d16PseudoWB_register; + case ARM::VST2d32PseudoWB_fixed: return ARM::VST2d32PseudoWB_register; + case ARM::VST2q8PseudoWB_fixed: return ARM::VST2q8PseudoWB_register; + case ARM::VST2q16PseudoWB_fixed: return ARM::VST2q16PseudoWB_register; + case ARM::VST2q32PseudoWB_fixed: return ARM::VST2q32PseudoWB_register; } return Opc; // If not one we handle, return it unchanged. } @@ -1646,13 +1662,13 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Align); if (isUpdating) { SDValue Inc = N->getOperand(AddrOpIdx + 1); - // FIXME: VLD1 fixed increment doesn't need Reg0. Remove the reg0 + // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0 // case entirely when the rest are updated to that form, too. 
- if (NumVecs == 1 && !isa<ConstantSDNode>(Inc.getNode())) + if ((NumVecs == 1 || NumVecs == 2) && !isa<ConstantSDNode>(Inc.getNode())) Opc = getVLDSTRegisterUpdateOpcode(Opc); - // We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so + // We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so // check for that explicitly too. Horribly hacky, but temporary. - if ((NumVecs != 1 && Opc != ARM::VLD1q64PseudoWB_fixed) || + if ((NumVecs != 1 && NumVecs != 2 && Opc != ARM::VLD1q64PseudoWB_fixed) || !isa<ConstantSDNode>(Inc.getNode())) Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); } @@ -1796,9 +1812,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Align); if (isUpdating) { SDValue Inc = N->getOperand(AddrOpIdx + 1); - // FIXME: VST1 fixed increment doesn't need Reg0. Remove the reg0 + // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0 // case entirely when the rest are updated to that form, too. - if (NumVecs == 1 && !isa<ConstantSDNode>(Inc.getNode())) + if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode())) Opc = getVLDSTRegisterUpdateOpcode(Opc); // We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so // check for that explicitly too. Horribly hacky, but temporary. @@ -2810,10 +2826,13 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VLD2_UPD: { - unsigned DOpcodes[] = { ARM::VLD2d8Pseudo_UPD, ARM::VLD2d16Pseudo_UPD, - ARM::VLD2d32Pseudo_UPD, ARM::VLD1q64PseudoWB_fixed}; - unsigned QOpcodes[] = { ARM::VLD2q8Pseudo_UPD, ARM::VLD2q16Pseudo_UPD, - ARM::VLD2q32Pseudo_UPD }; + unsigned DOpcodes[] = { ARM::VLD2d8PseudoWB_fixed, + ARM::VLD2d16PseudoWB_fixed, + ARM::VLD2d32PseudoWB_fixed, + ARM::VLD1q64PseudoWB_fixed}; + unsigned QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed, + ARM::VLD2q16PseudoWB_fixed, + ARM::VLD2q32PseudoWB_fixed }; return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0); } @@ -2876,16 +2895,19 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VST2_UPD: { - unsigned DOpcodes[] = { ARM::VST2d8Pseudo_UPD, ARM::VST2d16Pseudo_UPD, - ARM::VST2d32Pseudo_UPD, ARM::VST1q64PseudoWB_fixed}; - unsigned QOpcodes[] = { ARM::VST2q8Pseudo_UPD, ARM::VST2q16Pseudo_UPD, - ARM::VST2q32Pseudo_UPD }; + unsigned DOpcodes[] = { ARM::VST2d8PseudoWB_fixed, + ARM::VST2d16PseudoWB_fixed, + ARM::VST2d32PseudoWB_fixed, + ARM::VST1q64PseudoWB_fixed}; + unsigned QOpcodes[] = { ARM::VST2q8PseudoWB_fixed, + ARM::VST2q16PseudoWB_fixed, + ARM::VST2q32PseudoWB_fixed }; return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0); } case ARMISD::VST3_UPD: { unsigned DOpcodes[] = { ARM::VST3d8Pseudo_UPD, ARM::VST3d16Pseudo_UPD, - ARM::VST3d32Pseudo_UPD, ARM::VST1d64TPseudo_UPD }; + ARM::VST3d32Pseudo_UPD,ARM::VST1d64TPseudoWB_fixed}; unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, ARM::VST3q16Pseudo_UPD, ARM::VST3q32Pseudo_UPD }; @@ -2897,7 +2919,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case ARMISD::VST4_UPD: { unsigned DOpcodes[] = { ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, - ARM::VST4d32Pseudo_UPD, ARM::VST1d64QPseudo_UPD }; + ARM::VST4d32Pseudo_UPD,ARM::VST1d64QPseudoWB_fixed}; unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, ARM::VST4q16Pseudo_UPD, ARM::VST4q32Pseudo_UPD }; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 8c4c06f..c6c1f5b 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -72,7 +72,7 @@ ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for 
debugging only)"), cl::init(true)); -namespace llvm { +namespace { class ARMCCState : public CCState { public: ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, @@ -432,7 +432,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) addRegisterClass(MVT::i32, ARM::tGPRRegisterClass); else addRegisterClass(MVT::i32, ARM::GPRRegisterClass); - if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() && + !Subtarget->isThumb1Only()) { addRegisterClass(MVT::f32, ARM::SPRRegisterClass); if (!Subtarget->isFPOnlySP()) addRegisterClass(MVT::f64, ARM::DPRRegisterClass); @@ -467,13 +468,23 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // v2f64 is legal so that QR subregs can be extracted as f64 elements, but // neither Neon nor VFP support any arithmetic operations on it. + // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively + // supported for v4f32. setOperationAction(ISD::FADD, MVT::v2f64, Expand); setOperationAction(ISD::FSUB, MVT::v2f64, Expand); setOperationAction(ISD::FMUL, MVT::v2f64, Expand); + // FIXME: Code duplication: FDIV and FREM are expanded always, see + // ARMTargetLowering::addTypeForNEON method for details. setOperationAction(ISD::FDIV, MVT::v2f64, Expand); setOperationAction(ISD::FREM, MVT::v2f64, Expand); + // FIXME: Create unittest. + // In another words, find a way when "copysign" appears in DAG with vector + // operands. setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); + // FIXME: Code duplication: SETCC has custom operation action, see + // ARMTargetLowering::addTypeForNEON method for details. setOperationAction(ISD::SETCC, MVT::v2f64, Expand); + // FIXME: Create unittest for FNEG and for FABS. setOperationAction(ISD::FNEG, MVT::v2f64, Expand); setOperationAction(ISD::FABS, MVT::v2f64, Expand); setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); @@ -486,11 +497,23 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); setOperationAction(ISD::FEXP, MVT::v2f64, Expand); setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); + // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); setOperationAction(ISD::FRINT, MVT::v2f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); + + setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); + setOperationAction(ISD::FSIN, MVT::v4f32, Expand); + setOperationAction(ISD::FCOS, MVT::v4f32, Expand); + setOperationAction(ISD::FPOWI, MVT::v4f32, Expand); + setOperationAction(ISD::FPOW, MVT::v4f32, Expand); + setOperationAction(ISD::FLOG, MVT::v4f32, Expand); + setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); + setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); + setOperationAction(ISD::FEXP, MVT::v4f32, Expand); + setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); // Neon does not support some operations on v1i64 and v2i64 types. setOperationAction(ISD::MUL, MVT::v1i64, Expand); @@ -586,6 +609,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) setOperationAction(ISD::CTLZ, MVT::i32, Expand); + // These just redirect to CTTZ and CTLZ on ARM. + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); + // Only ARMv6 has BSWAP. 
if (!Subtarget->hasV6Ops()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); @@ -674,7 +701,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() && + !Subtarget->isThumb1Only()) { // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. setOperationAction(ISD::BITCAST, MVT::i64, Custom); @@ -712,7 +740,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f32, Expand); - if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() && + !Subtarget->isThumb1Only()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); } @@ -723,7 +752,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FMA, MVT::f32, Expand); // Various VFP goodness - if (!UseSoftFloat && !Subtarget->isThumb1Only()) { + if (!TM.Options.UseSoftFloat && !Subtarget->isThumb1Only()) { // int <-> fp are custom expanded into bit_convert + ARMISD ops. if (Subtarget->hasVFP2()) { setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); @@ -751,7 +780,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setStackPointerRegisterToSaveRestore(ARM::SP); - if (UseSoftFloat || Subtarget->isThumb1Only() || !Subtarget->hasVFP2()) + if (TM.Options.UseSoftFloat || Subtarget->isThumb1Only() || + !Subtarget->hasVFP2()) setSchedulingPreference(Sched::RegPressure); else setSchedulingPreference(Sched::Hybrid); @@ -1092,7 +1122,8 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, if (!Subtarget->isAAPCS_ABI()) return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); else if (Subtarget->hasVFP2() && - FloatABIType == FloatABI::Hard && !isVarArg) + getTargetMachine().Options.FloatABIType == FloatABI::Hard && + !isVarArg) return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); } @@ -2951,7 +2982,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - if (UnsafeFPMath && + if (getTargetMachine().Options.UnsafeFPMath && (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE || CC == ISD::SETUNE)) { SDValue Result = OptimizeVFPBrcond(Op, DAG); @@ -3978,9 +4009,8 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, } // Use vmov.f32 to materialize other v2f32 and v4f32 splats. - if (VT == MVT::v2f32 || VT == MVT::v4f32) { - ConstantFPSDNode *C = cast<ConstantFPSDNode>(Op.getOperand(0)); - int ImmVal = ARM_AM::getFP32Imm(C->getValueAPF()); + if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { + int ImmVal = ARM_AM::getFP32Imm(SplatBits); if (ImmVal != -1) { SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32); return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); @@ -6010,7 +6040,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { // executed. 
for (MachineBasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { - if (!II->getDesc().isCall()) continue; + if (!II->isCall()) continue; DenseMap<unsigned, bool> DefRegs; for (MachineInstr::mop_iterator @@ -6421,13 +6451,13 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { - const MCInstrDesc *MCID = &MI->getDesc(); - if (!MCID->hasPostISelHook()) { + if (!MI->hasPostISelHook()) { assert(!convertAddSubFlagsOpcode(MI->getOpcode()) && "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'"); return; } + const MCInstrDesc *MCID = &MI->getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional // operand is still set to noreg. If needed, set the optional operand's @@ -6454,7 +6484,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // Any ARM instruction that sets the 's' bit should specify an optional // "cc_out" operand in the last operand position. - if (!MCID->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { + if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { assert(!NewOpc && "Optional cc_out operand required"); return; } @@ -7948,7 +7978,7 @@ static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, // will return -0, so vmin can only be used for unsafe math or if one of // the operands is known to be nonzero. if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) && - !UnsafeFPMath && + !DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN; @@ -7970,7 +8000,7 @@ static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, // will return +0, so vmax can only be used for unsafe math or if one of // the operands is known to be nonzero. if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) && - !UnsafeFPMath && + !DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; Opcode = IsReversed ? 
ARMISD::FMIN : ARMISD::FMAX; diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 6940156..80f3773 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -201,21 +201,29 @@ def msr_mask : Operand<i32> { // 16 imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0> // 32 imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0> // 64 64 - <imm> is encoded in imm6<5:0> +def shr_imm8_asm_operand : ImmAsmOperand { let Name = "ShrImm8"; } def shr_imm8 : Operand<i32> { let EncoderMethod = "getShiftRight8Imm"; let DecoderMethod = "DecodeShiftRight8Imm"; + let ParserMatchClass = shr_imm8_asm_operand; } +def shr_imm16_asm_operand : ImmAsmOperand { let Name = "ShrImm16"; } def shr_imm16 : Operand<i32> { let EncoderMethod = "getShiftRight16Imm"; let DecoderMethod = "DecodeShiftRight16Imm"; + let ParserMatchClass = shr_imm16_asm_operand; } +def shr_imm32_asm_operand : ImmAsmOperand { let Name = "ShrImm32"; } def shr_imm32 : Operand<i32> { let EncoderMethod = "getShiftRight32Imm"; let DecoderMethod = "DecodeShiftRight32Imm"; + let ParserMatchClass = shr_imm32_asm_operand; } +def shr_imm64_asm_operand : ImmAsmOperand { let Name = "ShrImm64"; } def shr_imm64 : Operand<i32> { let EncoderMethod = "getShiftRight64Imm"; let DecoderMethod = "DecodeShiftRight64Imm"; + let ParserMatchClass = shr_imm64_asm_operand; } //===----------------------------------------------------------------------===// @@ -231,6 +239,14 @@ class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1> : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>; class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1> : InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>; +class NEONInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[HasNEON]>; + + +class VFP2MnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>, + Requires<[HasVFP2]>; +class NEONMnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>, + Requires<[HasNEON]>; //===----------------------------------------------------------------------===// // ARM Instruction templates. @@ -1994,73 +2010,111 @@ class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> { // VFP/NEON Instruction aliases for type suffices. 
class VFPDataTypeInstAlias<string opc, string dt, string asm, dag Result> : - InstAlias<!strconcat(opc, dt, asm), Result>; -multiclass VFPDT8ReqInstAlias<string opc, string asm, dag Result> { - def I8 : VFPDataTypeInstAlias<opc, ".i8", asm, Result>; - def S8 : VFPDataTypeInstAlias<opc, ".s8", asm, Result>; - def U8 : VFPDataTypeInstAlias<opc, ".u8", asm, Result>; - def F8 : VFPDataTypeInstAlias<opc, ".p8", asm, Result>; -} -// VFPDT8ReqInstAlias plus plain ".8" -multiclass VFPDT8InstAlias<string opc, string asm, dag Result> { - def _8 : VFPDataTypeInstAlias<opc, ".8", asm, Result>; - defm : VFPDT8ReqInstAlias<opc, asm, Result>; -} -multiclass VFPDT16ReqInstAlias<string opc, string asm, dag Result> { - def I16 : VFPDataTypeInstAlias<opc, ".i16", asm, Result>; - def S16 : VFPDataTypeInstAlias<opc, ".s16", asm, Result>; - def U16 : VFPDataTypeInstAlias<opc, ".u16", asm, Result>; - def F16 : VFPDataTypeInstAlias<opc, ".p16", asm, Result>; -} -// VFPDT16ReqInstAlias plus plain ".16" -multiclass VFPDT16InstAlias<string opc, string asm, dag Result> { - def _16 : VFPDataTypeInstAlias<opc, ".16", asm, Result>; - defm : VFPDT16ReqInstAlias<opc, asm, Result>; -} -multiclass VFPDT32ReqInstAlias<string opc, string asm, dag Result> { - def I32 : VFPDataTypeInstAlias<opc, ".i32", asm, Result>; - def S32 : VFPDataTypeInstAlias<opc, ".s32", asm, Result>; - def U32 : VFPDataTypeInstAlias<opc, ".u32", asm, Result>; - def F32 : VFPDataTypeInstAlias<opc, ".f32", asm, Result>; - def F : VFPDataTypeInstAlias<opc, ".f", asm, Result>; -} -// VFPDT32ReqInstAlias plus plain ".32" -multiclass VFPDT32InstAlias<string opc, string asm, dag Result> { - def _32 : VFPDataTypeInstAlias<opc, ".32", asm, Result>; - defm : VFPDT32ReqInstAlias<opc, asm, Result>; -} -multiclass VFPDT64ReqInstAlias<string opc, string asm, dag Result> { - def I64 : VFPDataTypeInstAlias<opc, ".i64", asm, Result>; - def S64 : VFPDataTypeInstAlias<opc, ".s64", asm, Result>; - def U64 : VFPDataTypeInstAlias<opc, ".u64", asm, Result>; - def F64 : VFPDataTypeInstAlias<opc, ".f64", asm, Result>; - def D : VFPDataTypeInstAlias<opc, ".d", asm, Result>; -} -// VFPDT64ReqInstAlias plus plain ".64" -multiclass VFPDT64InstAlias<string opc, string asm, dag Result> { - def _64 : VFPDataTypeInstAlias<opc, ".64", asm, Result>; - defm : VFPDT64ReqInstAlias<opc, asm, Result>; -} -multiclass VFPDT64NoF64ReqInstAlias<string opc, string asm, dag Result> { - def I64 : VFPDataTypeInstAlias<opc, ".i64", asm, Result>; - def S64 : VFPDataTypeInstAlias<opc, ".s64", asm, Result>; - def U64 : VFPDataTypeInstAlias<opc, ".u64", asm, Result>; - def D : VFPDataTypeInstAlias<opc, ".d", asm, Result>; -} -// VFPDT64ReqInstAlias plus plain ".64" -multiclass VFPDT64NoF64InstAlias<string opc, string asm, dag Result> { - def _64 : VFPDataTypeInstAlias<opc, ".64", asm, Result>; - defm : VFPDT64ReqInstAlias<opc, asm, Result>; -} + InstAlias<!strconcat(opc, dt, "\t", asm), Result>, Requires<[HasVFP2]>; + multiclass VFPDTAnyInstAlias<string opc, string asm, dag Result> { - defm : VFPDT8InstAlias<opc, asm, Result>; - defm : VFPDT16InstAlias<opc, asm, Result>; - defm : VFPDT32InstAlias<opc, asm, Result>; - defm : VFPDT64InstAlias<opc, asm, Result>; -} -multiclass VFPDTAnyNoF64InstAlias<string opc, string asm, dag Result> { - defm : VFPDT8InstAlias<opc, asm, Result>; - defm : VFPDT16InstAlias<opc, asm, Result>; - defm : VFPDT32InstAlias<opc, asm, Result>; - defm : VFPDT64NoF64InstAlias<opc, asm, Result>; -} + def : VFPDataTypeInstAlias<opc, ".8", asm, Result>; + def : VFPDataTypeInstAlias<opc, 
".16", asm, Result>; + def : VFPDataTypeInstAlias<opc, ".32", asm, Result>; + def : VFPDataTypeInstAlias<opc, ".64", asm, Result>; +} + +// The same alias classes using AsmPseudo instead, for the more complex +// stuff in NEON that InstAlias can't quite handle. +// Note that we can't use anonymous defm references here like we can +// above, as we care about the ultimate instruction enum names generated, unlike +// for instalias defs. +class NEONDataTypeAsmPseudoInst<string opc, string dt, string asm, dag iops> : + AsmPseudoInst<!strconcat(opc, dt, "\t", asm), iops>, Requires<[HasNEON]>; +multiclass NEONDT8ReqAsmPseudoInst<string opc, string asm, dag iops> { + def I8 : NEONDataTypeAsmPseudoInst<opc, ".i8", asm, iops>; + def S8 : NEONDataTypeAsmPseudoInst<opc, ".s8", asm, iops>; + def U8 : NEONDataTypeAsmPseudoInst<opc, ".u8", asm, iops>; + def P8 : NEONDataTypeAsmPseudoInst<opc, ".p8", asm, iops>; +} +// NEONDT8ReqAsmPseudoInst plus plain ".8" +multiclass NEONDT8AsmPseudoInst<string opc, string asm, dag iops> { + def _8 : NEONDataTypeAsmPseudoInst<opc, ".8", asm, iops>; + defm _ : NEONDT8ReqAsmPseudoInst<opc, asm, iops>; +} +multiclass NEONDT16ReqAsmPseudoInst<string opc, string asm, dag iops> { + def I16 : NEONDataTypeAsmPseudoInst<opc, ".i16", asm, iops>; + def S16 : NEONDataTypeAsmPseudoInst<opc, ".s16", asm, iops>; + def U16 : NEONDataTypeAsmPseudoInst<opc, ".u16", asm, iops>; + def P16 : NEONDataTypeAsmPseudoInst<opc, ".p16", asm, iops>; +} +// NEONDT16ReqAsmPseudoInst plus plain ".16" +multiclass NEONDT16AsmPseudoInst<string opc, string asm, dag iops> { + def _16 : NEONDataTypeAsmPseudoInst<opc, ".16", asm, iops>; + defm _ : NEONDT16ReqAsmPseudoInst<opc, asm, iops>; +} +multiclass NEONDT32ReqAsmPseudoInst<string opc, string asm, dag iops> { + def I32 : NEONDataTypeAsmPseudoInst<opc, ".i32", asm, iops>; + def S32 : NEONDataTypeAsmPseudoInst<opc, ".s32", asm, iops>; + def U32 : NEONDataTypeAsmPseudoInst<opc, ".u32", asm, iops>; + def F32 : NEONDataTypeAsmPseudoInst<opc, ".f32", asm, iops>; + def F : NEONDataTypeAsmPseudoInst<opc, ".f", asm, iops>; +} +// NEONDT32ReqAsmPseudoInst plus plain ".32" +multiclass NEONDT32AsmPseudoInst<string opc, string asm, dag iops> { + def _32 : NEONDataTypeAsmPseudoInst<opc, ".32", asm, iops>; + defm _ : NEONDT32ReqAsmPseudoInst<opc, asm, iops>; +} +multiclass NEONDT64ReqAsmPseudoInst<string opc, string asm, dag iops> { + def I64 : NEONDataTypeAsmPseudoInst<opc, ".i64", asm, iops>; + def S64 : NEONDataTypeAsmPseudoInst<opc, ".s64", asm, iops>; + def U64 : NEONDataTypeAsmPseudoInst<opc, ".u64", asm, iops>; + def F64 : NEONDataTypeAsmPseudoInst<opc, ".f64", asm, iops>; + def D : NEONDataTypeAsmPseudoInst<opc, ".d", asm, iops>; +} +// NEONDT64ReqAsmPseudoInst plus plain ".64" +multiclass NEONDT64AsmPseudoInst<string opc, string asm, dag iops> { + def _64 : NEONDataTypeAsmPseudoInst<opc, ".64", asm, iops>; + defm _ : NEONDT64ReqAsmPseudoInst<opc, asm, iops>; +} +multiclass NEONDT64NoF64ReqAsmPseudoInst<string opc, string asm, dag iops> { + def I64 : NEONDataTypeAsmPseudoInst<opc, ".i64", asm, iops>; + def S64 : NEONDataTypeAsmPseudoInst<opc, ".s64", asm, iops>; + def U64 : NEONDataTypeAsmPseudoInst<opc, ".u64", asm, iops>; + def D : NEONDataTypeAsmPseudoInst<opc, ".d", asm, iops>; +} +// NEONDT64ReqAsmPseudoInst plus plain ".64" +multiclass NEONDT64NoF64AsmPseudoInst<string opc, string asm, dag iops> { + def _64 : NEONDataTypeAsmPseudoInst<opc, ".64", asm, iops>; + defm _ : NEONDT64ReqAsmPseudoInst<opc, asm, iops>; +} +multiclass NEONDTAnyAsmPseudoInst<string 
opc, string asm, dag iops> { + defm _ : NEONDT8AsmPseudoInst<opc, asm, iops>; + defm _ : NEONDT16AsmPseudoInst<opc, asm, iops>; + defm _ : NEONDT32AsmPseudoInst<opc, asm, iops>; + defm _ : NEONDT64AsmPseudoInst<opc, asm, iops>; +} +multiclass NEONDTAnyNoF64AsmPseudoInst<string opc, string asm, dag iops> { + defm _ : NEONDT8AsmPseudoInst<opc, asm, iops>; + defm _ : NEONDT16AsmPseudoInst<opc, asm, iops>; + defm _ : NEONDT32AsmPseudoInst<opc, asm, iops>; + defm _ : NEONDT64NoF64AsmPseudoInst<opc, asm, iops>; +} + +// Data type suffix token aliases. Implements Table A7-3 in the ARM ARM. +def : TokenAlias<".s8", ".i8">; +def : TokenAlias<".u8", ".i8">; +def : TokenAlias<".s16", ".i16">; +def : TokenAlias<".u16", ".i16">; +def : TokenAlias<".s32", ".i32">; +def : TokenAlias<".u32", ".i32">; +def : TokenAlias<".s64", ".i64">; +def : TokenAlias<".u64", ".i64">; + +def : TokenAlias<".i8", ".8">; +def : TokenAlias<".i16", ".16">; +def : TokenAlias<".i32", ".32">; +def : TokenAlias<".i64", ".64">; + +def : TokenAlias<".p8", ".8">; +def : TokenAlias<".p16", ".16">; + +def : TokenAlias<".f32", ".32">; +def : TokenAlias<".f64", ".64">; +def : TokenAlias<".f", ".f32">; +def : TokenAlias<".d", ".f64">; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index be03924..516a080 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -238,27 +238,23 @@ def so_imm_not_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32); }]>; -/// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15]. -def imm1_15 : ImmLeaf<i32, [{ - return (int32_t)Imm >= 1 && (int32_t)Imm < 16; -}]>; - /// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31]. def imm16_31 : ImmLeaf<i32, [{ return (int32_t)Imm >= 16 && (int32_t)Imm < 32; }]>; -def so_imm_neg : - PatLeaf<(imm), [{ +def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; } +def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{ return ARM_AM::getSOImmVal(-(uint32_t)N->getZExtValue()) != -1; - }], so_imm_neg_XFORM>; + }], so_imm_neg_XFORM> { + let ParserMatchClass = so_imm_neg_asmoperand; +} // Note: this pattern doesn't require an encoder method and such, as it's // only used on aliases (Pat<> and InstAlias<>). The actual encoding -// is handled by the destination instructions, which use t2_so_imm. +// is handled by the destination instructions, which use so_imm. def so_imm_not_asmoperand : AsmOperandClass { let Name = "ARMSOImmNot"; } -def so_imm_not : - Operand<i32>, PatLeaf<(imm), [{ +def so_imm_not : Operand<i32>, PatLeaf<(imm), [{ return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1; }], so_imm_not_XFORM> { let ParserMatchClass = so_imm_not_asmoperand; @@ -512,6 +508,14 @@ def arm_i32imm : PatLeaf<(imm), [{ return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue()); }]>; +/// imm0_1 predicate - Immediate in the range [0,1]. +def Imm0_1AsmOperand: ImmAsmOperand { let Name = "Imm0_1"; } +def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; } + +/// imm0_3 predicate - Immediate in the range [0,3]. +def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; } +def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; } + /// imm0_7 predicate - Immediate in the range [0,7]. 
def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; } def imm0_7 : Operand<i32>, ImmLeaf<i32, [{ @@ -520,6 +524,42 @@ def imm0_7 : Operand<i32>, ImmLeaf<i32, [{ let ParserMatchClass = Imm0_7AsmOperand; } +/// imm8 predicate - Immediate is exactly 8. +def Imm8AsmOperand: ImmAsmOperand { let Name = "Imm8"; } +def imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 8; }]> { + let ParserMatchClass = Imm8AsmOperand; +} + +/// imm16 predicate - Immediate is exactly 16. +def Imm16AsmOperand: ImmAsmOperand { let Name = "Imm16"; } +def imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 16; }]> { + let ParserMatchClass = Imm16AsmOperand; +} + +/// imm32 predicate - Immediate is exactly 32. +def Imm32AsmOperand: ImmAsmOperand { let Name = "Imm32"; } +def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> { + let ParserMatchClass = Imm32AsmOperand; +} + +/// imm1_7 predicate - Immediate in the range [1,7]. +def Imm1_7AsmOperand: ImmAsmOperand { let Name = "Imm1_7"; } +def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> { + let ParserMatchClass = Imm1_7AsmOperand; +} + +/// imm1_15 predicate - Immediate in the range [1,15]. +def Imm1_15AsmOperand: ImmAsmOperand { let Name = "Imm1_15"; } +def imm1_15 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 16; }]> { + let ParserMatchClass = Imm1_15AsmOperand; +} + +/// imm1_31 predicate - Immediate in the range [1,31]. +def Imm1_31AsmOperand: ImmAsmOperand { let Name = "Imm1_31"; } +def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> { + let ParserMatchClass = Imm1_31AsmOperand; +} + /// imm0_15 predicate - Immediate in the range [0,15]. def Imm0_15AsmOperand: ImmAsmOperand { let Name = "Imm0_15"; } def imm0_15 : Operand<i32>, ImmLeaf<i32, [{ @@ -544,6 +584,14 @@ def imm0_32 : Operand<i32>, ImmLeaf<i32, [{ let ParserMatchClass = Imm0_32AsmOperand; } +/// imm0_63 predicate - True if the 32-bit immediate is in the range [0,63]. +def Imm0_63AsmOperand: ImmAsmOperand { let Name = "Imm0_63"; } +def imm0_63 : Operand<i32>, ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 64; +}]> { + let ParserMatchClass = Imm0_63AsmOperand; +} + /// imm0_255 predicate - Immediate in the range [0,255]. def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; } def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> { @@ -812,6 +860,9 @@ def addrmode6dup : Operand<i32>, let PrintMethod = "printAddrMode6Operand"; let MIOperandInfo = (ops GPR:$addr, i32imm); let EncoderMethod = "getAddrMode6DupAddressOpValue"; + // FIXME: This is close, but not quite right. The alignment specifier is + // different. + let ParserMatchClass = AddrMode6AsmOperand; } // addrmodepc := pc + reg @@ -2753,23 +2804,25 @@ defm STRHT : AI3strT<0b1011, "strht">; // Load / store multiple Instructions. // -multiclass arm_ldst_mult<string asm, bit L_bit, Format f, +multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f, InstrItinClass itin, InstrItinClass itin_upd> { // IA is the default, so no need for an explicit suffix on the // mnemonic here. Without it is the canonical spelling.
def IA : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), IndexModeNone, f, itin, - !strconcat(asm, "${p}\t$Rn, $regs"), "", []> { + !strconcat(asm, "${p}\t$Rn, $regs", sfx), "", []> { let Inst{24-23} = 0b01; // Increment After + let Inst{22} = P_bit; let Inst{21} = 0; // No writeback let Inst{20} = L_bit; } def IA_UPD : AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), IndexModeUpd, f, itin_upd, - !strconcat(asm, "${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + !strconcat(asm, "${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> { let Inst{24-23} = 0b01; // Increment After + let Inst{22} = P_bit; let Inst{21} = 1; // Writeback let Inst{20} = L_bit; @@ -2778,16 +2831,18 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f, def DA : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), IndexModeNone, f, itin, - !strconcat(asm, "da${p}\t$Rn, $regs"), "", []> { + !strconcat(asm, "da${p}\t$Rn, $regs", sfx), "", []> { let Inst{24-23} = 0b00; // Decrement After + let Inst{22} = P_bit; let Inst{21} = 0; // No writeback let Inst{20} = L_bit; } def DA_UPD : AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), IndexModeUpd, f, itin_upd, - !strconcat(asm, "da${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + !strconcat(asm, "da${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> { let Inst{24-23} = 0b00; // Decrement After + let Inst{22} = P_bit; let Inst{21} = 1; // Writeback let Inst{20} = L_bit; @@ -2796,16 +2851,18 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f, def DB : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), IndexModeNone, f, itin, - !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> { + !strconcat(asm, "db${p}\t$Rn, $regs", sfx), "", []> { let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = P_bit; let Inst{21} = 0; // No writeback let Inst{20} = L_bit; } def DB_UPD : AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), IndexModeUpd, f, itin_upd, - !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + !strconcat(asm, "db${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> { let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = P_bit; let Inst{21} = 1; // Writeback let Inst{20} = L_bit; @@ -2814,16 +2871,18 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f, def IB : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), IndexModeNone, f, itin, - !strconcat(asm, "ib${p}\t$Rn, $regs"), "", []> { + !strconcat(asm, "ib${p}\t$Rn, $regs", sfx), "", []> { let Inst{24-23} = 0b11; // Increment Before + let Inst{22} = P_bit; let Inst{21} = 0; // No writeback let Inst{20} = L_bit; } def IB_UPD : AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), IndexModeUpd, f, itin_upd, - !strconcat(asm, "ib${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + !strconcat(asm, "ib${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> { let Inst{24-23} = 0b11; // Increment Before + let Inst{22} = P_bit; let Inst{21} = 1; // Writeback let Inst{20} = L_bit; @@ -2834,10 +2893,12 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f, let neverHasSideEffects = 1 in { let mayLoad = 1, hasExtraDefRegAllocReq = 1 in -defm LDM : arm_ldst_mult<"ldm", 1, LdStMulFrm, IIC_iLoad_m, IIC_iLoad_mu>; +defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm, IIC_iLoad_m, + IIC_iLoad_mu>; let mayStore = 1, hasExtraSrcRegAllocReq = 1 in -defm STM : arm_ldst_mult<"stm", 0, LdStMulFrm, IIC_iStore_m, IIC_iStore_mu>; +defm STM : arm_ldst_mult<"stm", "", 0, 0, LdStMulFrm, IIC_iStore_m, + IIC_iStore_mu>; } // 
neverHasSideEffects @@ -2851,6 +2912,16 @@ def LDMIA_RET : ARMPseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, (LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>, RegConstraint<"$Rn = $wb">; +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +defm sysLDM : arm_ldst_mult<"ldm", " ^", 1, 1, LdStMulFrm, IIC_iLoad_m, + IIC_iLoad_mu>; + +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m, + IIC_iStore_mu>; + + + //===----------------------------------------------------------------------===// // Move Instructions. // @@ -4999,6 +5070,32 @@ def : MnemonicAlias<"usubaddx", "usax">; // for isel. def : ARMInstAlias<"mov${s}${p} $Rd, $imm", (MVNi rGPR:$Rd, so_imm_not:$imm, pred:$p, cc_out:$s)>; +def : ARMInstAlias<"mvn${s}${p} $Rd, $imm", + (MOVi rGPR:$Rd, so_imm_not:$imm, pred:$p, cc_out:$s)>; +// Same for AND <--> BIC +def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm", + (ANDri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : ARMInstAlias<"bic${s}${p} $Rdn, $imm", + (ANDri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm", + (BICri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : ARMInstAlias<"and${s}${p} $Rdn, $imm", + (BICri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm, + pred:$p, cc_out:$s)>; + +// Likewise, "add Rd, so_imm_neg" -> sub +def : ARMInstAlias<"add${s}${p} $Rd, $Rn, $imm", + (SUBri GPR:$Rd, GPR:$Rn, so_imm_neg:$imm, pred:$p, cc_out:$s)>; +def : ARMInstAlias<"add${s}${p} $Rd, $imm", + (SUBri GPR:$Rd, GPR:$Rd, so_imm_neg:$imm, pred:$p, cc_out:$s)>; +// Same for CMP <--> CMN via so_imm_neg +def : ARMInstAlias<"cmp${p} $Rd, $imm", + (CMNzri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>; +def : ARMInstAlias<"cmn${p} $Rd, $imm", + (CMPri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>; // The shifter forms of the MOV instruction are aliased to the ASR, LSL, // LSR, ROR, and RRX instructions. @@ -5056,4 +5153,8 @@ def : ARMInstAlias<"ror${s}${p} $Rn, $Rm", // 'mul' instruction can be specified with only two operands. def : ARMInstAlias<"mul${s}${p} $Rn, $Rm", - (MUL rGPR:$Rn, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; + (MUL rGPR:$Rn, rGPR:$Rm, rGPR:$Rn, pred:$p, cc_out:$s)>; + +// "neg" is an alias for "rsb rd, rn, #0" +def : ARMInstAlias<"neg${s}${p} $Rd, $Rm", + (RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index f2ca963..c40860d 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -74,9 +74,11 @@ def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{ let MIOperandInfo = (ops i32imm); } +// Register list of one D register.
def VecListOneDAsmOperand : AsmOperandClass { let Name = "VecListOneD"; let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; } def VecListOneD : RegisterOperand<DPR, "printVectorListOne"> { let ParserMatchClass = VecListOneDAsmOperand; @@ -85,6 +87,7 @@ def VecListOneD : RegisterOperand<DPR, "printVectorListOne"> { def VecListTwoDAsmOperand : AsmOperandClass { let Name = "VecListTwoD"; let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; } def VecListTwoD : RegisterOperand<DPR, "printVectorListTwo"> { let ParserMatchClass = VecListTwoDAsmOperand; @@ -93,6 +96,7 @@ def VecListTwoD : RegisterOperand<DPR, "printVectorListTwo"> { def VecListThreeDAsmOperand : AsmOperandClass { let Name = "VecListThreeD"; let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; } def VecListThreeD : RegisterOperand<DPR, "printVectorListThree"> { let ParserMatchClass = VecListThreeDAsmOperand; @@ -101,6 +105,7 @@ def VecListThreeD : RegisterOperand<DPR, "printVectorListThree"> { def VecListFourDAsmOperand : AsmOperandClass { let Name = "VecListFourD"; let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; } def VecListFourD : RegisterOperand<DPR, "printVectorListFour"> { let ParserMatchClass = VecListFourDAsmOperand; @@ -109,11 +114,92 @@ def VecListFourD : RegisterOperand<DPR, "printVectorListFour"> { def VecListTwoQAsmOperand : AsmOperandClass { let Name = "VecListTwoQ"; let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; } -def VecListTwoQ : RegisterOperand<DPR, "printVectorListTwo"> { +def VecListTwoQ : RegisterOperand<DPR, "printVectorListTwoSpaced"> { let ParserMatchClass = VecListTwoQAsmOperand; } +// Register list of one D register, with "all lanes" subscripting. +def VecListOneDAllLanesAsmOperand : AsmOperandClass { + let Name = "VecListOneDAllLanes"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; +} +def VecListOneDAllLanes : RegisterOperand<DPR, "printVectorListOneAllLanes"> { + let ParserMatchClass = VecListOneDAllLanesAsmOperand; +} +// Register list of two D registers, with "all lanes" subscripting. +def VecListTwoDAllLanesAsmOperand : AsmOperandClass { + let Name = "VecListTwoDAllLanes"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; +} +def VecListTwoDAllLanes : RegisterOperand<DPR, "printVectorListTwoAllLanes"> { + let ParserMatchClass = VecListTwoDAllLanesAsmOperand; +} + +// Register list of one D register, with byte lane subscripting. +def VecListOneDByteIndexAsmOperand : AsmOperandClass { + let Name = "VecListOneDByteIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListOneDByteIndexed : Operand<i32> { + let ParserMatchClass = VecListOneDByteIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with half-word lane subscripting. +def VecListOneDHWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListOneDHWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListOneDHWordIndexed : Operand<i32> { + let ParserMatchClass = VecListOneDHWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with word lane subscripting. 
+def VecListOneDWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListOneDWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListOneDWordIndexed : Operand<i32> { + let ParserMatchClass = VecListOneDWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// Register list of two D registers, with byte lane subscripting. +def VecListTwoDByteIndexAsmOperand : AsmOperandClass { + let Name = "VecListTwoDByteIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListTwoDByteIndexed : Operand<i32> { + let ParserMatchClass = VecListTwoDByteIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with half-word lane subscripting. +def VecListTwoDHWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListTwoDHWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListTwoDHWordIndexed : Operand<i32> { + let ParserMatchClass = VecListTwoDHWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with word lane subscripting. +def VecListTwoDWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListTwoDWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListTwoDWordIndexed : Operand<i32> { + let ParserMatchClass = VecListTwoDWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} + //===----------------------------------------------------------------------===// // NEON-specific DAG Nodes. //===----------------------------------------------------------------------===// @@ -272,12 +358,23 @@ class VLDQWBregisterPseudo<InstrItinClass itin> : PseudoNLdSt<(outs QPR:$dst, GPR:$wb), (ins addrmode6:$addr, rGPR:$offset), itin, "$addr.addr = $wb">; + class VLDQQPseudo<InstrItinClass itin> : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), itin, "">; class VLDQQWBPseudo<InstrItinClass itin> : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset), itin, "$addr.addr = $wb">; +class VLDQQWBfixedPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), + (ins addrmode6:$addr), itin, + "$addr.addr = $wb">; +class VLDQQWBregisterPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, rGPR:$offset), itin, + "$addr.addr = $wb">; + + class VLDQQQQPseudo<InstrItinClass itin> : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin, "$src = $dst">; @@ -462,31 +559,23 @@ defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64">; def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>; // VLD2 : Vector Load (multiple 2-element structures) -class VLD2D<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy> +class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, + InstrItinClass itin> : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd), - (ins addrmode6:$Rn), IIC_VLD2, - "vld2", Dt, "$Vd, $Rn", "", []> { - let Rm = 0b1111; - let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; -} -class VLD2Q<bits<4> op7_4, string Dt, RegisterOperand VdTy> - : NLdSt<0, 0b10, 0b0011, op7_4, - (outs VdTy:$Vd), - (ins addrmode6:$Rn), IIC_VLD2x2, + (ins addrmode6:$Rn), itin, "vld2", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDInstruction"; } -def VLD2d8 : VLD2D<0b1000, {0,0,?,?}, "8", VecListTwoD>; -def VLD2d16 : VLD2D<0b1000, {0,1,?,?}, "16", 
VecListTwoD>; -def VLD2d32 : VLD2D<0b1000, {1,0,?,?}, "32", VecListTwoD>; +def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListTwoD, IIC_VLD2>; +def VLD2d16 : VLD2<0b1000, {0,1,?,?}, "16", VecListTwoD, IIC_VLD2>; +def VLD2d32 : VLD2<0b1000, {1,0,?,?}, "32", VecListTwoD, IIC_VLD2>; -def VLD2q8 : VLD2Q<{0,0,?,?}, "8", VecListFourD>; -def VLD2q16 : VLD2Q<{0,1,?,?}, "16", VecListFourD>; -def VLD2q32 : VLD2Q<{1,0,?,?}, "32", VecListFourD>; +def VLD2q8 : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2>; +def VLD2q16 : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2>; +def VLD2q32 : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2>; def VLD2d8Pseudo : VLDQPseudo<IIC_VLD2>; def VLD2d16Pseudo : VLDQPseudo<IIC_VLD2>; @@ -497,47 +586,56 @@ def VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>; def VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>; // ...with address register writeback: -class VLD2DWB<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy> - : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb), - (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD2u, - "vld2", Dt, "$Vd, $Rn$Rm", - "$Rn.addr = $wb", []> { - let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; -} -class VLD2QWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> - : NLdSt<0, 0b10, 0b0011, op7_4, - (outs VdTy:$Vd, GPR:$wb), - (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD2x2u, - "vld2", Dt, "$Vd, $Rn$Rm", - "$Rn.addr = $wb", []> { - let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; +multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt, + RegisterOperand VdTy, InstrItinClass itin> { + def _fixed : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb), + (ins addrmode6:$Rn), itin, + "vld2", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
+ let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; + let AsmMatchConverter = "cvtVLDwbFixed"; + } + def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb), + (ins addrmode6:$Rn, rGPR:$Rm), itin, + "vld2", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; + let AsmMatchConverter = "cvtVLDwbRegister"; + } } -def VLD2d8_UPD : VLD2DWB<0b1000, {0,0,?,?}, "8", VecListTwoD>; -def VLD2d16_UPD : VLD2DWB<0b1000, {0,1,?,?}, "16", VecListTwoD>; -def VLD2d32_UPD : VLD2DWB<0b1000, {1,0,?,?}, "32", VecListTwoD>; +defm VLD2d8wb : VLD2WB<0b1000, {0,0,?,?}, "8", VecListTwoD, IIC_VLD2u>; +defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListTwoD, IIC_VLD2u>; +defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListTwoD, IIC_VLD2u>; -def VLD2q8_UPD : VLD2QWB<{0,0,?,?}, "8", VecListFourD>; -def VLD2q16_UPD : VLD2QWB<{0,1,?,?}, "16", VecListFourD>; -def VLD2q32_UPD : VLD2QWB<{1,0,?,?}, "32", VecListFourD>; +defm VLD2q8wb : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u>; +defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u>; +defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u>; -def VLD2d8Pseudo_UPD : VLDQWBPseudo<IIC_VLD2u>; -def VLD2d16Pseudo_UPD : VLDQWBPseudo<IIC_VLD2u>; -def VLD2d32Pseudo_UPD : VLDQWBPseudo<IIC_VLD2u>; +def VLD2d8PseudoWB_fixed : VLDQWBfixedPseudo<IIC_VLD2u>; +def VLD2d16PseudoWB_fixed : VLDQWBfixedPseudo<IIC_VLD2u>; +def VLD2d32PseudoWB_fixed : VLDQWBfixedPseudo<IIC_VLD2u>; +def VLD2d8PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD2u>; +def VLD2d16PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD2u>; +def VLD2d32PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD2u>; -def VLD2q8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD2x2u>; -def VLD2q16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD2x2u>; -def VLD2q32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD2x2u>; +def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>; +def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>; +def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>; +def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>; +def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>; +def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>; // ...with double-spaced registers -def VLD2b8 : VLD2D<0b1001, {0,0,?,?}, "8", VecListTwoQ>; -def VLD2b16 : VLD2D<0b1001, {0,1,?,?}, "16", VecListTwoQ>; -def VLD2b32 : VLD2D<0b1001, {1,0,?,?}, "32", VecListTwoQ>; -def VLD2b8_UPD : VLD2DWB<0b1001, {0,0,?,?}, "8", VecListTwoQ>; -def VLD2b16_UPD : VLD2DWB<0b1001, {0,1,?,?}, "16", VecListTwoQ>; -def VLD2b32_UPD : VLD2DWB<0b1001, {1,0,?,?}, "32", VecListTwoQ>; +def VLD2b8 : VLD2<0b1001, {0,0,?,?}, "8", VecListTwoQ, IIC_VLD2>; +def VLD2b16 : VLD2<0b1001, {0,1,?,?}, "16", VecListTwoQ, IIC_VLD2>; +def VLD2b32 : VLD2<0b1001, {1,0,?,?}, "32", VecListTwoQ, IIC_VLD2>; +defm VLD2b8wb : VLD2WB<0b1001, {0,0,?,?}, "8", VecListTwoQ, IIC_VLD2u>; +defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListTwoQ, IIC_VLD2u>; +defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListTwoQ, IIC_VLD2u>; // VLD3 : Vector Load (multiple 3-element structures) class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -997,9 +1095,11 @@ def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>; // VLD1DUP : Vector Load (single element to all lanes) class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp> - : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd), (ins addrmode6dup:$Rn), - IIC_VLD1dup, "vld1", 
Dt, "\\{$Vd[]\\}, $Rn", "", - [(set DPR:$Vd, (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> { + : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListOneDAllLanes:$Vd), + (ins addrmode6dup:$Rn), + IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "", + [(set VecListOneDAllLanes:$Vd, + (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; @@ -1025,9 +1125,9 @@ def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { class VLD1QDUP<bits<4> op7_4, string Dt> - : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, DPR:$dst2), + : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListTwoDAllLanes:$Vd), (ins addrmode6dup:$Rn), IIC_VLD1dup, - "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> { + "vld1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; @@ -1038,32 +1138,63 @@ def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16">; def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32">; // ...with address register writeback: -class VLD1DUPWB<bits<4> op7_4, string Dt> - : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, GPR:$wb), - (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu, - "vld1", Dt, "\\{$Vd[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> { - let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVLD1DupInstruction"; +multiclass VLD1DUPWB<bits<4> op7_4, string Dt> { + def _fixed : NLdSt<1, 0b10, 0b1100, op7_4, + (outs VecListOneDAllLanes:$Vd, GPR:$wb), + (ins addrmode6dup:$Rn), IIC_VLD1dupu, + "vld1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; + let AsmMatchConverter = "cvtVLDwbFixed"; + } + def _register : NLdSt<1, 0b10, 0b1100, op7_4, + (outs VecListOneDAllLanes:$Vd, GPR:$wb), + (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD1dupu, + "vld1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; + let AsmMatchConverter = "cvtVLDwbRegister"; + } } -class VLD1QDUPWB<bits<4> op7_4, string Dt> - : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), - (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu, - "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> { - let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVLD1DupInstruction"; +multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> { + def _fixed : NLdSt<1, 0b10, 0b1100, op7_4, + (outs VecListTwoDAllLanes:$Vd, GPR:$wb), + (ins addrmode6dup:$Rn), IIC_VLD1dupu, + "vld1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
+ let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; + let AsmMatchConverter = "cvtVLDwbFixed"; + } + def _register : NLdSt<1, 0b10, 0b1100, op7_4, + (outs VecListTwoDAllLanes:$Vd, GPR:$wb), + (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD1dupu, + "vld1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; + let AsmMatchConverter = "cvtVLDwbRegister"; + } } -def VLD1DUPd8_UPD : VLD1DUPWB<{0,0,0,0}, "8">; -def VLD1DUPd16_UPD : VLD1DUPWB<{0,1,0,?}, "16">; -def VLD1DUPd32_UPD : VLD1DUPWB<{1,0,0,?}, "32">; +defm VLD1DUPd8wb : VLD1DUPWB<{0,0,0,0}, "8">; +defm VLD1DUPd16wb : VLD1DUPWB<{0,1,0,?}, "16">; +defm VLD1DUPd32wb : VLD1DUPWB<{1,0,0,?}, "32">; -def VLD1DUPq8_UPD : VLD1QDUPWB<{0,0,1,0}, "8">; -def VLD1DUPq16_UPD : VLD1QDUPWB<{0,1,1,?}, "16">; -def VLD1DUPq32_UPD : VLD1QDUPWB<{1,0,1,?}, "32">; +defm VLD1DUPq8wb : VLD1QDUPWB<{0,0,1,0}, "8">; +defm VLD1DUPq16wb : VLD1QDUPWB<{0,1,1,?}, "16">; +defm VLD1DUPq32wb : VLD1QDUPWB<{1,0,1,?}, "32">; -def VLD1DUPq8Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>; -def VLD1DUPq16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>; -def VLD1DUPq32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>; +def VLD1DUPq8PseudoWB_fixed : VLDQWBfixedPseudo<IIC_VLD1dupu>; +def VLD1DUPq16PseudoWB_fixed : VLDQWBfixedPseudo<IIC_VLD1dupu>; +def VLD1DUPq32PseudoWB_fixed : VLDQWBfixedPseudo<IIC_VLD1dupu>; +def VLD1DUPq8PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD1dupu>; +def VLD1DUPq16PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD1dupu>; +def VLD1DUPq32PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD1dupu>; // VLD2DUP : Vector Load (single 2-element structure to all lanes) class VLD2DUP<bits<4> op7_4, string Dt> @@ -1329,94 +1460,109 @@ def VST1q64PseudoWB_register : VSTQWBregisterPseudo<IIC_VST1x2u>; // ...with 3 registers class VST1D3<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs), - (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), - IIC_VST1x3, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> { + (ins addrmode6:$Rn, VecListThreeD:$Vd), + IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVSTInstruction"; } -class VST1D3WB<bits<4> op7_4, string Dt> - : NLdSt<0, 0b00, 0b0110, op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, am6offset:$Rm, - DPR:$Vd, DPR:$src2, DPR:$src3), - IIC_VST1x3u, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { - let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVSTInstruction"; +multiclass VST1D3WB<bits<4> op7_4, string Dt> { + def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u, + "vst1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
+ let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; + let AsmMatchConverter = "cvtVSTwbFixed"; + } + def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, rGPR:$Rm, VecListThreeD:$Vd), + IIC_VLD1x3u, + "vst1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; + let AsmMatchConverter = "cvtVSTwbRegister"; + } } -def VST1d8T : VST1D3<{0,0,0,?}, "8">; -def VST1d16T : VST1D3<{0,1,0,?}, "16">; -def VST1d32T : VST1D3<{1,0,0,?}, "32">; -def VST1d64T : VST1D3<{1,1,0,?}, "64">; +def VST1d8T : VST1D3<{0,0,0,?}, "8">; +def VST1d16T : VST1D3<{0,1,0,?}, "16">; +def VST1d32T : VST1D3<{1,0,0,?}, "32">; +def VST1d64T : VST1D3<{1,1,0,?}, "64">; -def VST1d8T_UPD : VST1D3WB<{0,0,0,?}, "8">; -def VST1d16T_UPD : VST1D3WB<{0,1,0,?}, "16">; -def VST1d32T_UPD : VST1D3WB<{1,0,0,?}, "32">; -def VST1d64T_UPD : VST1D3WB<{1,1,0,?}, "64">; +defm VST1d8Twb : VST1D3WB<{0,0,0,?}, "8">; +defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16">; +defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32">; +defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64">; -def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>; -def VST1d64TPseudo_UPD : VSTQQWBPseudo<IIC_VST1x3u>; +def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>; +def VST1d64TPseudoWB_fixed : VSTQQWBPseudo<IIC_VST1x3u>; +def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>; // ...with 4 registers class VST1D4<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0010, op7_4, (outs), - (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST1x4, "vst1", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", "", + (ins addrmode6:$Rn, VecListFourD:$Vd), + IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVSTInstruction"; } -class VST1D4WB<bits<4> op7_4, string Dt> - : NLdSt<0, 0b00, 0b0010, op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, am6offset:$Rm, - DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST1x4u, - "vst1", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { - let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; +multiclass VST1D4WB<bits<4> op7_4, string Dt> { + def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, VecListFourD:$Vd), IIC_VLD1x4u, + "vst1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
+ let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; + let AsmMatchConverter = "cvtVSTwbFixed"; + } + def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd), + IIC_VLD1x4u, + "vst1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; + let AsmMatchConverter = "cvtVSTwbRegister"; + } } -def VST1d8Q : VST1D4<{0,0,?,?}, "8">; -def VST1d16Q : VST1D4<{0,1,?,?}, "16">; -def VST1d32Q : VST1D4<{1,0,?,?}, "32">; -def VST1d64Q : VST1D4<{1,1,?,?}, "64">; +def VST1d8Q : VST1D4<{0,0,?,?}, "8">; +def VST1d16Q : VST1D4<{0,1,?,?}, "16">; +def VST1d32Q : VST1D4<{1,0,?,?}, "32">; +def VST1d64Q : VST1D4<{1,1,?,?}, "64">; -def VST1d8Q_UPD : VST1D4WB<{0,0,?,?}, "8">; -def VST1d16Q_UPD : VST1D4WB<{0,1,?,?}, "16">; -def VST1d32Q_UPD : VST1D4WB<{1,0,?,?}, "32">; -def VST1d64Q_UPD : VST1D4WB<{1,1,?,?}, "64">; +defm VST1d8Qwb : VST1D4WB<{0,0,?,?}, "8">; +defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16">; +defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32">; +defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64">; -def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>; -def VST1d64QPseudo_UPD : VSTQQWBPseudo<IIC_VST1x4u>; +def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>; +def VST1d64QPseudoWB_fixed : VSTQQWBPseudo<IIC_VST1x4u>; +def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>; // VST2 : Vector Store (multiple 2-element structures) -class VST2D<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<0, 0b00, op11_8, op7_4, (outs), - (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2), - IIC_VST2, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn", "", []> { - let Rm = 0b1111; - let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; -} -class VST2Q<bits<4> op7_4, string Dt> - : NLdSt<0, 0b00, 0b0011, op7_4, (outs), - (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST2x2, "vst2", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", - "", []> { +class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, + InstrItinClass itin> + : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, VdTy:$Vd), + itin, "vst2", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVSTInstruction"; } -def VST2d8 : VST2D<0b1000, {0,0,?,?}, "8">; -def VST2d16 : VST2D<0b1000, {0,1,?,?}, "16">; -def VST2d32 : VST2D<0b1000, {1,0,?,?}, "32">; +def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListTwoD, IIC_VST2>; +def VST2d16 : VST2<0b1000, {0,1,?,?}, "16", VecListTwoD, IIC_VST2>; +def VST2d32 : VST2<0b1000, {1,0,?,?}, "32", VecListTwoD, IIC_VST2>; -def VST2q8 : VST2Q<{0,0,?,?}, "8">; -def VST2q16 : VST2Q<{0,1,?,?}, "16">; -def VST2q32 : VST2Q<{1,0,?,?}, "32">; +def VST2q8 : VST2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VST2x2>; +def VST2q16 : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2>; +def VST2q32 : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2>; def VST2d8Pseudo : VSTQPseudo<IIC_VST2>; def VST2d16Pseudo : VSTQPseudo<IIC_VST2>; @@ -1427,47 +1573,76 @@ def VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>; def VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>; // ...with address register writeback: -class VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2), - IIC_VST2u, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { - let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; +multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt, + 
RegisterOperand VdTy> { + def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, VdTy:$Vd), IIC_VLD1u, + "vst2", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; + let AsmMatchConverter = "cvtVSTwbFixed"; + } + def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u, + "vst2", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; + let AsmMatchConverter = "cvtVSTwbRegister"; + } } -class VST2QWB<bits<4> op7_4, string Dt> - : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, am6offset:$Rm, - DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST2x2u, - "vst2", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { - let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; +multiclass VST2QWB<bits<4> op7_4, string Dt> { + def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, VecListFourD:$Vd), IIC_VLD1u, + "vst2", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; + let AsmMatchConverter = "cvtVSTwbFixed"; + } + def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd), + IIC_VLD1u, + "vst2", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; + let AsmMatchConverter = "cvtVSTwbRegister"; + } } -def VST2d8_UPD : VST2DWB<0b1000, {0,0,?,?}, "8">; -def VST2d16_UPD : VST2DWB<0b1000, {0,1,?,?}, "16">; -def VST2d32_UPD : VST2DWB<0b1000, {1,0,?,?}, "32">; +defm VST2d8wb : VST2DWB<0b1000, {0,0,?,?}, "8", VecListTwoD>; +defm VST2d16wb : VST2DWB<0b1000, {0,1,?,?}, "16", VecListTwoD>; +defm VST2d32wb : VST2DWB<0b1000, {1,0,?,?}, "32", VecListTwoD>; -def VST2q8_UPD : VST2QWB<{0,0,?,?}, "8">; -def VST2q16_UPD : VST2QWB<{0,1,?,?}, "16">; -def VST2q32_UPD : VST2QWB<{1,0,?,?}, "32">; +defm VST2q8wb : VST2QWB<{0,0,?,?}, "8">; +defm VST2q16wb : VST2QWB<{0,1,?,?}, "16">; +defm VST2q32wb : VST2QWB<{1,0,?,?}, "32">; -def VST2d8Pseudo_UPD : VSTQWBPseudo<IIC_VST2u>; -def VST2d16Pseudo_UPD : VSTQWBPseudo<IIC_VST2u>; -def VST2d32Pseudo_UPD : VSTQWBPseudo<IIC_VST2u>; +def VST2d8PseudoWB_fixed : VSTQWBPseudo<IIC_VST2u>; +def VST2d16PseudoWB_fixed : VSTQWBPseudo<IIC_VST2u>; +def VST2d32PseudoWB_fixed : VSTQWBPseudo<IIC_VST2u>; +def VST2d8PseudoWB_register : VSTQWBPseudo<IIC_VST2u>; +def VST2d16PseudoWB_register : VSTQWBPseudo<IIC_VST2u>; +def VST2d32PseudoWB_register : VSTQWBPseudo<IIC_VST2u>; -def VST2q8Pseudo_UPD : VSTQQWBPseudo<IIC_VST2x2u>; -def VST2q16Pseudo_UPD : VSTQQWBPseudo<IIC_VST2x2u>; -def VST2q32Pseudo_UPD : VSTQQWBPseudo<IIC_VST2x2u>; +def VST2q8PseudoWB_fixed : VSTQQWBPseudo<IIC_VST2x2u>; +def VST2q16PseudoWB_fixed : VSTQQWBPseudo<IIC_VST2x2u>; +def VST2q32PseudoWB_fixed : VSTQQWBPseudo<IIC_VST2x2u>; +def VST2q8PseudoWB_register : VSTQQWBPseudo<IIC_VST2x2u>; +def VST2q16PseudoWB_register : VSTQQWBPseudo<IIC_VST2x2u>; +def VST2q32PseudoWB_register : VSTQQWBPseudo<IIC_VST2x2u>; // ...with double-spaced registers -def VST2b8 : VST2D<0b1001, {0,0,?,?}, "8">; -def VST2b16 : VST2D<0b1001, {0,1,?,?}, "16">; -def VST2b32 : VST2D<0b1001, {1,0,?,?}, "32">; -def VST2b8_UPD : VST2DWB<0b1001, {0,0,?,?}, "8">; -def 
VST2b16_UPD : VST2DWB<0b1001, {0,1,?,?}, "16">; -def VST2b32_UPD : VST2DWB<0b1001, {1,0,?,?}, "32">; +def VST2b8 : VST2<0b1001, {0,0,?,?}, "8", VecListTwoQ, IIC_VST2>; +def VST2b16 : VST2<0b1001, {0,1,?,?}, "16", VecListTwoQ, IIC_VST2>; +def VST2b32 : VST2<0b1001, {1,0,?,?}, "32", VecListTwoQ, IIC_VST2>; +defm VST2b8wb : VST2DWB<0b1001, {0,0,?,?}, "8", VecListTwoQ>; +defm VST2b16wb : VST2DWB<0b1001, {0,1,?,?}, "16", VecListTwoQ>; +defm VST2b32wb : VST2DWB<0b1001, {1,0,?,?}, "32", VecListTwoQ>; // VST3 : Vector Store (multiple 3-element structures) class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -1741,10 +1916,10 @@ def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>; // ...with address register writeback: class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST2lnu, "vst2", Dt, - "\\{$src1[$lane], $src2[$lane]\\}, $addr$offset", - "$addr.addr = $wb", []> { + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, nohash_imm:$lane), IIC_VST2lnu, "vst2", Dt, + "\\{$Vd[$lane], $src2[$lane]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVST2LN"; } @@ -2573,9 +2748,9 @@ class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // Long shift by immediate. class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, SDNode OpNode> + ValueType ResTy, ValueType OpTy, Operand ImmTy, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, op6, op4, - (outs QPR:$Vd), (ins DPR:$Vm, i32imm:$SIMM), N2RegVShLFrm, + (outs QPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), N2RegVShLFrm, IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", [(set QPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>; @@ -2805,14 +2980,11 @@ multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, v4i32, v4i32, OpNode, Commutable>; } -multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, string Dt, SDNode ShOp> { - def v4i16 : N3VDSL16<0b01, op11_8, OpcodeStr, !strconcat(Dt, "16"), - v4i16, ShOp>; - def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D, OpcodeStr, !strconcat(Dt,"32"), - v2i32, ShOp>; - def v8i16 : N3VQSL16<0b01, op11_8, OpcodeStr, !strconcat(Dt, "16"), - v8i16, v4i16, ShOp>; - def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q, OpcodeStr, !strconcat(Dt,"32"), +multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> { + def v4i16 : N3VDSL16<0b01, op11_8, OpcodeStr, "i16", v4i16, ShOp>; + def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D, OpcodeStr, "i32", v2i32, ShOp>; + def v8i16 : N3VQSL16<0b01, op11_8, OpcodeStr, "i16", v8i16, v4i16, ShOp>; + def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q, OpcodeStr, "i32", v4i32, v2i32, ShOp>; } @@ -3477,15 +3649,15 @@ multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, string OpcodeStr, string Dt, SDNode OpNode> { def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4, - OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode> { + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, imm1_7, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v4i32 : N2VLSh<op24, op23, op11_8, op7, op6, op4, - OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode> { + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, imm1_15, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v2i64 : N2VLSh<op24, op23, op11_8, op7, op6, op4, - 
OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode> { + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, imm1_31, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } } @@ -3574,7 +3746,7 @@ def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32", v2f32, v2f32, fmul, 1>; def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32", v4f32, v4f32, fmul, 1>; -defm VMULsl : N3VSL_HS<0b1000, "vmul", "i", mul>; +defm VMULsl : N3VSL_HS<0b1000, "vmul", mul>; def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>; def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32, v2f32, fmul>; @@ -4285,18 +4457,18 @@ defm VSHLLu : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u", NEONvshllu>; // VSHLL : Vector Shift Left Long (with maximum shift count) class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy, - ValueType OpTy, SDNode OpNode> + ValueType OpTy, Operand ImmTy, SDNode OpNode> : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt, - ResTy, OpTy, OpNode> { + ResTy, OpTy, ImmTy, OpNode> { let Inst{21-16} = op21_16; let DecoderMethod = "DecodeVSHLMaxInstruction"; } def VSHLLi8 : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8", - v8i16, v8i8, NEONvshlli>; + v8i16, v8i8, imm8, NEONvshlli>; def VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16", - v4i32, v4i16, NEONvshlli>; + v4i32, v4i16, imm16, NEONvshlli>; def VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32", - v2i64, v2i32, NEONvshlli>; + v2i64, v2i32, imm32, NEONvshlli>; // VSHRN : Vector Shift Right and Narrow defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", @@ -4469,10 +4641,6 @@ def : InstAlias<"vmov${p} $Vd, $Vm", (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>; def : InstAlias<"vmov${p} $Vd, $Vm", (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>; -defm : VFPDTAnyNoF64InstAlias<"vmov${p}", "$Vd, $Vm", - (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>; -defm : VFPDTAnyNoF64InstAlias<"vmov${p}", "$Vd, $Vm", - (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>; // VMOV : Vector Move (Immediate) @@ -4932,34 +5100,34 @@ def : AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>; // VEXT : Vector Extract -class VEXTd<string OpcodeStr, string Dt, ValueType Ty> +class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy> : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd), - (ins DPR:$Vn, DPR:$Vm, i32imm:$index), NVExtFrm, + (ins DPR:$Vn, DPR:$Vm, immTy:$index), NVExtFrm, IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "", [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn), - (Ty DPR:$Vm), imm:$index)))]> { + (Ty DPR:$Vm), imm:$index)))]> { bits<4> index; let Inst{11-8} = index{3-0}; } -class VEXTq<string OpcodeStr, string Dt, ValueType Ty> +class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy> : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$Vd), - (ins QPR:$Vn, QPR:$Vm, i32imm:$index), NVExtFrm, + (ins QPR:$Vn, QPR:$Vm, imm0_15:$index), NVExtFrm, IIC_VEXTQ, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "", [(set QPR:$Vd, (Ty (NEONvext (Ty QPR:$Vn), - (Ty QPR:$Vm), imm:$index)))]> { + (Ty QPR:$Vm), imm:$index)))]> { bits<4> index; let Inst{11-8} = index{3-0}; } -def VEXTd8 : VEXTd<"vext", "8", v8i8> { +def VEXTd8 : VEXTd<"vext", "8", v8i8, imm0_7> { let Inst{11-8} = index{3-0}; } -def VEXTd16 : VEXTd<"vext", "16", v4i16> { +def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> { let Inst{11-9} = index{2-0}; let Inst{8} = 0b0; } -def VEXTd32 : VEXTd<"vext", "32", v2i32> { +def VEXTd32 : 
VEXTd<"vext", "32", v2i32, imm0_1> { let Inst{11-10} = index{1-0}; let Inst{9-8} = 0b00; } @@ -4968,17 +5136,21 @@ def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn), (i32 imm:$index))), (VEXTd32 DPR:$Vn, DPR:$Vm, imm:$index)>; -def VEXTq8 : VEXTq<"vext", "8", v16i8> { +def VEXTq8 : VEXTq<"vext", "8", v16i8, imm0_15> { let Inst{11-8} = index{3-0}; } -def VEXTq16 : VEXTq<"vext", "16", v8i16> { +def VEXTq16 : VEXTq<"vext", "16", v8i16, imm0_7> { let Inst{11-9} = index{2-0}; let Inst{8} = 0b0; } -def VEXTq32 : VEXTq<"vext", "32", v4i32> { +def VEXTq32 : VEXTq<"vext", "32", v4i32, imm0_3> { let Inst{11-10} = index{1-0}; let Inst{9-8} = 0b00; } +def VEXTq64 : VEXTq<"vext", "64", v2i64, imm0_1> { + let Inst{11} = index{0}; + let Inst{10-8} = 0b000; +} def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn), (v4f32 QPR:$Vm), (i32 imm:$index))), @@ -5026,17 +5198,17 @@ def VTBL1 let hasExtraSrcRegAllocReq = 1 in { def VTBL2 : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$Vd), - (ins DPR:$Vn, DPR:$tbl2, DPR:$Vm), NVTBLFrm, IIC_VTB2, - "vtbl", "8", "$Vd, \\{$Vn, $tbl2\\}, $Vm", "", []>; + (ins VecListTwoD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB2, + "vtbl", "8", "$Vd, $Vn, $Vm", "", []>; def VTBL3 : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$Vd), - (ins DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$Vm), NVTBLFrm, IIC_VTB3, - "vtbl", "8", "$Vd, \\{$Vn, $tbl2, $tbl3\\}, $Vm", "", []>; + (ins VecListThreeD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB3, + "vtbl", "8", "$Vd, $Vn, $Vm", "", []>; def VTBL4 : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$Vd), - (ins DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$Vm), + (ins VecListFourD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB4, - "vtbl", "8", "$Vd, \\{$Vn, $tbl2, $tbl3, $tbl4\\}, $Vm", "", []>; + "vtbl", "8", "$Vd, $Vn, $Vm", "", []>; } // hasExtraSrcRegAllocReq = 1 def VTBL2Pseudo @@ -5056,18 +5228,18 @@ def VTBX1 let hasExtraSrcRegAllocReq = 1 in { def VTBX2 : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$Vd), - (ins DPR:$orig, DPR:$Vn, DPR:$tbl2, DPR:$Vm), NVTBLFrm, IIC_VTBX2, - "vtbx", "8", "$Vd, \\{$Vn, $tbl2\\}, $Vm", "$orig = $Vd", []>; + (ins DPR:$orig, VecListTwoD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX2, + "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd", []>; def VTBX3 : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$Vd), - (ins DPR:$orig, DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$Vm), + (ins DPR:$orig, VecListThreeD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX3, - "vtbx", "8", "$Vd, \\{$Vn, $tbl2, $tbl3\\}, $Vm", + "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd", []>; def VTBX4 - : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$Vd), (ins DPR:$orig, DPR:$Vn, - DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$Vm), NVTBLFrm, IIC_VTBX4, - "vtbx", "8", "$Vd, \\{$Vn, $tbl2, $tbl3, $tbl4\\}, $Vm", + : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$Vd), + (ins DPR:$orig, VecListFourD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX4, + "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd", []>; } // hasExtraSrcRegAllocReq = 1 @@ -5207,11 +5379,83 @@ def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; // Assembler aliases // -// VAND/VEOR/VORR accept but do not require a type suffix. +def : VFP2InstAlias<"fmdhr${p} $Dd, $Rn", + (VSETLNi32 DPR:$Dd, GPR:$Rn, 1, pred:$p)>; +def : VFP2InstAlias<"fmdlr${p} $Dd, $Rn", + (VSETLNi32 DPR:$Dd, GPR:$Rn, 0, pred:$p)>; + + +// VADD two-operand aliases. 
+def : NEONInstAlias<"vadd${p}.i8 $Vdn, $Vm", + (VADDv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vadd${p}.i16 $Vdn, $Vm", + (VADDv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vadd${p}.i32 $Vdn, $Vm", + (VADDv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vadd${p}.i64 $Vdn, $Vm", + (VADDv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vadd${p}.i8 $Vdn, $Vm", + (VADDv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vadd${p}.i16 $Vdn, $Vm", + (VADDv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vadd${p}.i32 $Vdn, $Vm", + (VADDv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vadd${p}.i64 $Vdn, $Vm", + (VADDv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vadd${p}.f32 $Vdn, $Vm", + (VADDfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vadd${p}.f32 $Vdn, $Vm", + (VADDfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + +// VSUB two-operand aliases. +def : NEONInstAlias<"vsub${p}.i8 $Vdn, $Vm", + (VSUBv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vsub${p}.i16 $Vdn, $Vm", + (VSUBv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vsub${p}.i32 $Vdn, $Vm", + (VSUBv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vsub${p}.i64 $Vdn, $Vm", + (VSUBv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vsub${p}.i8 $Vdn, $Vm", + (VSUBv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vsub${p}.i16 $Vdn, $Vm", + (VSUBv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vsub${p}.i32 $Vdn, $Vm", + (VSUBv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vsub${p}.i64 $Vdn, $Vm", + (VSUBv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vsub${p}.f32 $Vdn, $Vm", + (VSUBfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vsub${p}.f32 $Vdn, $Vm", + (VSUBfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + +// VADDW two-operand aliases. +def : NEONInstAlias<"vaddw${p}.s8 $Vdn, $Vm", + (VADDWsv8i16 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vaddw${p}.s16 $Vdn, $Vm", + (VADDWsv4i32 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vaddw${p}.s32 $Vdn, $Vm", + (VADDWsv2i64 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vaddw${p}.u8 $Vdn, $Vm", + (VADDWuv8i16 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vaddw${p}.u16 $Vdn, $Vm", + (VADDWuv4i32 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vaddw${p}.u32 $Vdn, $Vm", + (VADDWuv2i64 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; + +// VAND/VBIC/VEOR/VORR accept but do not require a type suffix. 
defm : VFPDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm", (VANDd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; defm : VFPDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm", (VANDq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; +defm : VFPDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm", + (VBICd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; +defm : VFPDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm", + (VBICq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; defm : VFPDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm", (VEORd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; defm : VFPDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm", @@ -5220,245 +5464,450 @@ defm : VFPDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm", (VORRd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; defm : VFPDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm", (VORRq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; - -// VLD1 requires a size suffix, but also accepts type specific variants. -// Load one D register. -defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d8 VecListOneD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d16 VecListOneD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d32 VecListOneD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d64 VecListOneD:$Vd, addrmode6:$Rn, pred:$p)>; -// with writeback, fixed stride -defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d8wb_fixed VecListOneD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d16wb_fixed VecListOneD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d32wb_fixed VecListOneD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d64wb_fixed VecListOneD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; -// with writeback, register stride -defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d8wb_register VecListOneD:$Vd, zero_reg, addrmode6:$Rn, - rGPR:$Rm, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d16wb_register VecListOneD:$Vd, zero_reg, addrmode6:$Rn, - rGPR:$Rm, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d32wb_register VecListOneD:$Vd, zero_reg, addrmode6:$Rn, - rGPR:$Rm, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d64wb_register VecListOneD:$Vd, zero_reg, addrmode6:$Rn, - rGPR:$Rm, pred:$p)>; - -// Load two D registers. 
-defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1q8 VecListTwoD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1q16 VecListTwoD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1q32 VecListTwoD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1q64 VecListTwoD:$Vd, addrmode6:$Rn, pred:$p)>; -// with writeback, fixed stride -defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1q8wb_fixed VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1q16wb_fixed VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1q32wb_fixed VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1q64wb_fixed VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; -// with writeback, register stride -defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1q8wb_register VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, - rGPR:$Rm, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1q16wb_register VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, - rGPR:$Rm, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1q32wb_register VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, - rGPR:$Rm, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1q64wb_register VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, - rGPR:$Rm, pred:$p)>; - -// Load three D registers. -defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d8T VecListThreeD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d16T VecListThreeD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d32T VecListThreeD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d64T VecListThreeD:$Vd, addrmode6:$Rn, pred:$p)>; -// with writeback, fixed stride -defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d8Twb_fixed VecListThreeD:$Vd, zero_reg, - addrmode6:$Rn, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d16Twb_fixed VecListThreeD:$Vd, zero_reg, - addrmode6:$Rn, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d32Twb_fixed VecListThreeD:$Vd, zero_reg, - addrmode6:$Rn, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d64Twb_fixed VecListThreeD:$Vd, zero_reg, - addrmode6:$Rn, pred:$p)>; -// with writeback, register stride -defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d8Twb_register VecListThreeD:$Vd, zero_reg, - addrmode6:$Rn, rGPR:$Rm, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d16Twb_register VecListThreeD:$Vd, zero_reg, - addrmode6:$Rn, rGPR:$Rm, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d32Twb_register VecListThreeD:$Vd, zero_reg, - addrmode6:$Rn, rGPR:$Rm, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d64Twb_register VecListThreeD:$Vd, zero_reg, - addrmode6:$Rn, rGPR:$Rm, pred:$p)>; - - -// Load four D registers. 
-defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d8Q VecListFourD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d16Q VecListFourD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d32Q VecListFourD:$Vd, addrmode6:$Rn, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn", - (VLD1d64Q VecListFourD:$Vd, addrmode6:$Rn, pred:$p)>; -// with writeback, fixed stride -defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d8Qwb_fixed VecListFourD:$Vd, zero_reg, - addrmode6:$Rn, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d16Qwb_fixed VecListFourD:$Vd, zero_reg, - addrmode6:$Rn, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d32Qwb_fixed VecListFourD:$Vd, zero_reg, - addrmode6:$Rn, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn!", - (VLD1d64Qwb_fixed VecListFourD:$Vd, zero_reg, - addrmode6:$Rn, pred:$p)>; -// with writeback, register stride -defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d8Qwb_register VecListFourD:$Vd, zero_reg, - addrmode6:$Rn, rGPR:$Rm, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d16Qwb_register VecListFourD:$Vd, zero_reg, - addrmode6:$Rn, rGPR:$Rm, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d32Qwb_register VecListFourD:$Vd, zero_reg, - addrmode6:$Rn, rGPR:$Rm, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", - (VLD1d64Qwb_register VecListFourD:$Vd, zero_reg, - addrmode6:$Rn, rGPR:$Rm, pred:$p)>; - -// VST1 requires a size suffix, but also accepts type specific variants. -// Store one D register. -defm : VFPDT8ReqInstAlias<"vst1${p}", "$Vd, $Rn", - (VST1d8 addrmode6:$Rn, VecListOneD:$Vd, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vst1${p}", "$Vd, $Rn", - (VST1d16 addrmode6:$Rn, VecListOneD:$Vd, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn", - (VST1d32 addrmode6:$Rn, VecListOneD:$Vd, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn", - (VST1d64 addrmode6:$Rn, VecListOneD:$Vd, pred:$p)>; -// with writeback, fixed stride -defm : VFPDT8ReqInstAlias<"vst1${p}", "$Vd, $Rn!", - (VST1d8wb_fixed zero_reg, addrmode6:$Rn, VecListOneD:$Vd, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vst1${p}", "$Vd, $Rn!", - (VST1d16wb_fixed zero_reg, addrmode6:$Rn, VecListOneD:$Vd, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn!", - (VST1d32wb_fixed zero_reg, addrmode6:$Rn, VecListOneD:$Vd, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn!", - (VST1d64wb_fixed zero_reg, addrmode6:$Rn, VecListOneD:$Vd, pred:$p)>; -// with writeback, register stride -defm : VFPDT8ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", - (VST1d8wb_register zero_reg, addrmode6:$Rn, rGPR:$Rm, - VecListOneD:$Vd, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", - (VST1d16wb_register zero_reg, addrmode6:$Rn, rGPR:$Rm, - VecListOneD:$Vd, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", - (VST1d32wb_register zero_reg, addrmode6:$Rn, rGPR:$Rm, - VecListOneD:$Vd, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", - (VST1d64wb_register zero_reg, addrmode6:$Rn, rGPR:$Rm, - VecListOneD:$Vd, pred:$p)>; - -// Store two D registers. 
-defm : VFPDT8ReqInstAlias<"vst1${p}", "$Vd, $Rn", - (VST1q8 addrmode6:$Rn, VecListTwoD:$Vd, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vst1${p}", "$Vd, $Rn", - (VST1q16 addrmode6:$Rn, VecListTwoD:$Vd, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn", - (VST1q32 addrmode6:$Rn, VecListTwoD:$Vd, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn", - (VST1q64 addrmode6:$Rn, VecListTwoD:$Vd, pred:$p)>; -// with writeback, fixed stride -defm : VFPDT8ReqInstAlias<"vst1${p}", "$Vd, $Rn!", - (VST1q8wb_fixed zero_reg, addrmode6:$Rn, VecListTwoD:$Vd, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vst1${p}", "$Vd, $Rn!", - (VST1q16wb_fixed zero_reg, addrmode6:$Rn, VecListTwoD:$Vd, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn!", - (VST1q32wb_fixed zero_reg, addrmode6:$Rn, VecListTwoD:$Vd, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn!", - (VST1q64wb_fixed zero_reg, addrmode6:$Rn, VecListTwoD:$Vd, pred:$p)>; -// with writeback, register stride -defm : VFPDT8ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", - (VST1q8wb_register zero_reg, addrmode6:$Rn, - rGPR:$Rm, VecListTwoD:$Vd, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", - (VST1q16wb_register zero_reg, addrmode6:$Rn, - rGPR:$Rm, VecListTwoD:$Vd, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", - (VST1q32wb_register zero_reg, addrmode6:$Rn, - rGPR:$Rm, VecListTwoD:$Vd, pred:$p)>; -defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", - (VST1q64wb_register zero_reg, addrmode6:$Rn, - rGPR:$Rm, VecListTwoD:$Vd, pred:$p)>; - -// FIXME: The three and four register VST1 instructions haven't been moved -// to the VecList* encoding yet, so we can't do assembly parsing support -// for them. Uncomment these when that happens. -// Load three D registers. -//defm : VFPDT8ReqInstAlias<"vst1${p}", "$Vd, $Rn", -// (VST1d8T addrmode6:$Rn, VecListThreeD:$Vd, pred:$p)>; -//defm : VFPDT16ReqInstAlias<"vst1${p}", "$Vd, $Rn", -// (VST1d16T addrmode6:$Rn, VecListThreeD:$Vd, pred:$p)>; -//defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn", -// (VST1d32T addrmode6:$Rn, VecListThreeD:$Vd, pred:$p)>; -//defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn", -// (VST1d64T addrmode6:$Rn, VecListThreeD:$Vd, pred:$p)>; - -// Load four D registers. -//defm : VFPDT8ReqInstAlias<"vst1${p}", "$Vd, $Rn", -// (VST1d8Q addrmode6:$Rn, VecListFourD:$Vd, pred:$p)>; -//defm : VFPDT16ReqInstAlias<"vst1${p}", "$Vd, $Rn", -// (VST1d16Q addrmode6:$Rn, VecListFourD:$Vd, pred:$p)>; -//defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn", -// (VST1d32Q addrmode6:$Rn, VecListFourD:$Vd, pred:$p)>; -//defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn", -// (VST1d64Q addrmode6:$Rn, VecListFourD:$Vd, pred:$p)>; - - -// VTRN instructions data type suffix aliases for more-specific types. -defm : VFPDT8ReqInstAlias <"vtrn${p}", "$Dd, $Dm", - (VTRNd8 DPR:$Dd, DPR:$Dm, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vtrn${p}", "$Dd, $Dm", - (VTRNd16 DPR:$Dd, DPR:$Dm, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vtrn${p}", "$Dd, $Dm", - (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>; - -defm : VFPDT8ReqInstAlias <"vtrn${p}", "$Qd, $Qm", - (VTRNq8 QPR:$Qd, QPR:$Qm, pred:$p)>; -defm : VFPDT16ReqInstAlias<"vtrn${p}", "$Qd, $Qm", - (VTRNq16 QPR:$Qd, QPR:$Qm, pred:$p)>; -defm : VFPDT32ReqInstAlias<"vtrn${p}", "$Qd, $Qm", - (VTRNq32 QPR:$Qd, QPR:$Qm, pred:$p)>; +// ... 
two-operand aliases +def : NEONInstAlias<"vand${p} $Vdn, $Vm", + (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vand${p} $Vdn, $Vm", + (VANDq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vbic${p} $Vdn, $Vm", + (VBICd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vbic${p} $Vdn, $Vm", + (VBICq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"veor${p} $Vdn, $Vm", + (VEORd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"veor${p} $Vdn, $Vm", + (VEORq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vorr${p} $Vdn, $Vm", + (VORRd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vorr${p} $Vdn, $Vm", + (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + +defm : VFPDTAnyInstAlias<"vand${p}", "$Vdn, $Vm", + (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +defm : VFPDTAnyInstAlias<"vand${p}", "$Vdn, $Vm", + (VANDq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +defm : VFPDTAnyInstAlias<"veor${p}", "$Vdn, $Vm", + (VEORd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +defm : VFPDTAnyInstAlias<"veor${p}", "$Vdn, $Vm", + (VEORq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +defm : VFPDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm", + (VORRd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +defm : VFPDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm", + (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + +// VMUL two-operand aliases. +def : NEONInstAlias<"vmul${p}.p8 $Qdn, $Qm", + (VMULpq QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vmul${p}.i8 $Qdn, $Qm", + (VMULv16i8 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vmul${p}.i16 $Qdn, $Qm", + (VMULv8i16 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vmul${p}.i32 $Qdn, $Qm", + (VMULv4i32 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; + +def : NEONInstAlias<"vmul${p}.p8 $Ddn, $Dm", + (VMULpd DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vmul${p}.i8 $Ddn, $Dm", + (VMULv8i8 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vmul${p}.i16 $Ddn, $Dm", + (VMULv4i16 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vmul${p}.i32 $Ddn, $Dm", + (VMULv2i32 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; + +def : NEONInstAlias<"vmul${p}.f32 $Qdn, $Qm", + (VMULfq QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vmul${p}.f32 $Ddn, $Dm", + (VMULfd DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; + +def : NEONInstAlias<"vmul${p}.i16 $Ddn, $Dm$lane", + (VMULslv4i16 DPR:$Ddn, DPR:$Ddn, DPR_8:$Dm, + VectorIndex16:$lane, pred:$p)>; +def : NEONInstAlias<"vmul${p}.i16 $Qdn, $Dm$lane", + (VMULslv8i16 QPR:$Qdn, QPR:$Qdn, DPR_8:$Dm, + VectorIndex16:$lane, pred:$p)>; + +def : NEONInstAlias<"vmul${p}.i32 $Ddn, $Dm$lane", + (VMULslv2i32 DPR:$Ddn, DPR:$Ddn, DPR_VFP2:$Dm, + VectorIndex32:$lane, pred:$p)>; +def : NEONInstAlias<"vmul${p}.i32 $Qdn, $Dm$lane", + (VMULslv4i32 QPR:$Qdn, QPR:$Qdn, DPR_VFP2:$Dm, + VectorIndex32:$lane, pred:$p)>; + +def : NEONInstAlias<"vmul${p}.f32 $Ddn, $Dm$lane", + (VMULslfd DPR:$Ddn, DPR:$Ddn, DPR_VFP2:$Dm, + VectorIndex32:$lane, pred:$p)>; +def : NEONInstAlias<"vmul${p}.f32 $Qdn, $Dm$lane", + (VMULslfq QPR:$Qdn, QPR:$Qdn, DPR_VFP2:$Dm, + VectorIndex32:$lane, pred:$p)>; + +// VQADD (register) two-operand aliases. 
+def : NEONInstAlias<"vqadd${p}.s8 $Vdn, $Vm", + (VQADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.s16 $Vdn, $Vm", + (VQADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.s32 $Vdn, $Vm", + (VQADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.s64 $Vdn, $Vm", + (VQADDsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.u8 $Vdn, $Vm", + (VQADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.u16 $Vdn, $Vm", + (VQADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.u32 $Vdn, $Vm", + (VQADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.u64 $Vdn, $Vm", + (VQADDuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vqadd${p}.s8 $Vdn, $Vm", + (VQADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.s16 $Vdn, $Vm", + (VQADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.s32 $Vdn, $Vm", + (VQADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.s64 $Vdn, $Vm", + (VQADDsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.u8 $Vdn, $Vm", + (VQADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.u16 $Vdn, $Vm", + (VQADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.u32 $Vdn, $Vm", + (VQADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqadd${p}.u64 $Vdn, $Vm", + (VQADDuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + +// VSHL (immediate) two-operand aliases. +def : NEONInstAlias<"vshl${p}.i8 $Vdn, $imm", + (VSHLiv8i8 DPR:$Vdn, DPR:$Vdn, imm0_7:$imm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.i16 $Vdn, $imm", + (VSHLiv4i16 DPR:$Vdn, DPR:$Vdn, imm0_15:$imm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.i32 $Vdn, $imm", + (VSHLiv2i32 DPR:$Vdn, DPR:$Vdn, imm0_31:$imm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.i64 $Vdn, $imm", + (VSHLiv1i64 DPR:$Vdn, DPR:$Vdn, imm0_63:$imm, pred:$p)>; + +def : NEONInstAlias<"vshl${p}.i8 $Vdn, $imm", + (VSHLiv16i8 QPR:$Vdn, QPR:$Vdn, imm0_7:$imm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.i16 $Vdn, $imm", + (VSHLiv8i16 QPR:$Vdn, QPR:$Vdn, imm0_15:$imm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.i32 $Vdn, $imm", + (VSHLiv4i32 QPR:$Vdn, QPR:$Vdn, imm0_31:$imm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.i64 $Vdn, $imm", + (VSHLiv2i64 QPR:$Vdn, QPR:$Vdn, imm0_63:$imm, pred:$p)>; + +// VSHL (register) two-operand aliases. 
+def : NEONInstAlias<"vshl${p}.s8 $Vdn, $Vm", + (VSHLsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.s16 $Vdn, $Vm", + (VSHLsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.s32 $Vdn, $Vm", + (VSHLsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.s64 $Vdn, $Vm", + (VSHLsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.u8 $Vdn, $Vm", + (VSHLuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.u16 $Vdn, $Vm", + (VSHLuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.u32 $Vdn, $Vm", + (VSHLuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.u64 $Vdn, $Vm", + (VSHLuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vshl${p}.s8 $Vdn, $Vm", + (VSHLsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.s16 $Vdn, $Vm", + (VSHLsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.s32 $Vdn, $Vm", + (VSHLsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.s64 $Vdn, $Vm", + (VSHLsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.u8 $Vdn, $Vm", + (VSHLuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.u16 $Vdn, $Vm", + (VSHLuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.u32 $Vdn, $Vm", + (VSHLuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vshl${p}.u64 $Vdn, $Vm", + (VSHLuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + +// VSHL (immediate) two-operand aliases. +def : NEONInstAlias<"vshr${p}.s8 $Vdn, $imm", + (VSHRsv8i8 DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.s16 $Vdn, $imm", + (VSHRsv4i16 DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.s32 $Vdn, $imm", + (VSHRsv2i32 DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.s64 $Vdn, $imm", + (VSHRsv1i64 DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>; + +def : NEONInstAlias<"vshr${p}.s8 $Vdn, $imm", + (VSHRsv16i8 QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.s16 $Vdn, $imm", + (VSHRsv8i16 QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.s32 $Vdn, $imm", + (VSHRsv4i32 QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.s64 $Vdn, $imm", + (VSHRsv2i64 QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>; + +def : NEONInstAlias<"vshr${p}.u8 $Vdn, $imm", + (VSHRuv8i8 DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.u16 $Vdn, $imm", + (VSHRuv4i16 DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.u32 $Vdn, $imm", + (VSHRuv2i32 DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.u64 $Vdn, $imm", + (VSHRuv1i64 DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>; + +def : NEONInstAlias<"vshr${p}.u8 $Vdn, $imm", + (VSHRuv16i8 QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.u16 $Vdn, $imm", + (VSHRuv8i16 QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.u32 $Vdn, $imm", + (VSHRuv4i32 QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>; +def : NEONInstAlias<"vshr${p}.u64 $Vdn, $imm", + (VSHRuv2i64 QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>; + +// VLD1 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. 
+defm VLD1LNdAsm : NEONDT8AsmPseudoInst<"vld1${p}", "$list, $addr", + (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VLD1LNdAsm : NEONDT16AsmPseudoInst<"vld1${p}", "$list, $addr", + (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VLD1LNdAsm : NEONDT32AsmPseudoInst<"vld1${p}", "$list, $addr", + (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + +defm VLD1LNdWB_fixed_Asm : NEONDT8AsmPseudoInst<"vld1${p}", "$list, $addr!", + (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VLD1LNdWB_fixed_Asm : NEONDT16AsmPseudoInst<"vld1${p}", "$list, $addr!", + (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VLD1LNdWB_fixed_Asm : NEONDT32AsmPseudoInst<"vld1${p}", "$list, $addr!", + (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VLD1LNdWB_register_Asm : + NEONDT8AsmPseudoInst<"vld1${p}", "$list, $addr, $Rm", + (ins VecListOneDByteIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; +defm VLD1LNdWB_register_Asm : + NEONDT16AsmPseudoInst<"vld1${p}", "$list, $addr, $Rm", + (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; +defm VLD1LNdWB_register_Asm : + NEONDT32AsmPseudoInst<"vld1${p}", "$list, $addr, $Rm", + (ins VecListOneDWordIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; + + +// VST1 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. +defm VST1LNdAsm : NEONDT8AsmPseudoInst<"vst1${p}", "$list, $addr", + (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VST1LNdAsm : NEONDT16AsmPseudoInst<"vst1${p}", "$list, $addr", + (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VST1LNdAsm : NEONDT32AsmPseudoInst<"vst1${p}", "$list, $addr", + (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + +defm VST1LNdWB_fixed_Asm : NEONDT8AsmPseudoInst<"vst1${p}", "$list, $addr!", + (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VST1LNdWB_fixed_Asm : NEONDT16AsmPseudoInst<"vst1${p}", "$list, $addr!", + (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VST1LNdWB_fixed_Asm : NEONDT32AsmPseudoInst<"vst1${p}", "$list, $addr!", + (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VST1LNdWB_register_Asm : + NEONDT8AsmPseudoInst<"vst1${p}", "$list, $addr, $Rm", + (ins VecListOneDByteIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; +defm VST1LNdWB_register_Asm : + NEONDT16AsmPseudoInst<"vst1${p}", "$list, $addr, $Rm", + (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; +defm VST1LNdWB_register_Asm : + NEONDT32AsmPseudoInst<"vst1${p}", "$list, $addr, $Rm", + (ins VecListOneDWordIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; + +// VLD2 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. 
+defm VLD2LNdAsm : NEONDT8AsmPseudoInst<"vld2${p}", "$list, $addr", + (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VLD2LNdAsm : NEONDT16AsmPseudoInst<"vld2${p}", "$list, $addr", + (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VLD2LNdAsm : NEONDT32AsmPseudoInst<"vld2${p}", "$list, $addr", + (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + +defm VLD2LNdWB_fixed_Asm : NEONDT8AsmPseudoInst<"vld2${p}", "$list, $addr!", + (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VLD2LNdWB_fixed_Asm : NEONDT16AsmPseudoInst<"vld2${p}", "$list, $addr!", + (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VLD2LNdWB_fixed_Asm : NEONDT32AsmPseudoInst<"vld2${p}", "$list, $addr!", + (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VLD2LNdWB_register_Asm : + NEONDT8AsmPseudoInst<"vld2${p}", "$list, $addr, $Rm", + (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; +defm VLD2LNdWB_register_Asm : + NEONDT16AsmPseudoInst<"vld2${p}", "$list, $addr, $Rm", + (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; +defm VLD2LNdWB_register_Asm : + NEONDT32AsmPseudoInst<"vld2${p}", "$list, $addr, $Rm", + (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; + + +// VST2 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. +defm VST2LNdAsm : NEONDT8AsmPseudoInst<"vst2${p}", "$list, $addr", + (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VST2LNdAsm : NEONDT16AsmPseudoInst<"vst2${p}", "$list, $addr", + (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VST2LNdAsm : NEONDT32AsmPseudoInst<"vst2${p}", "$list, $addr", + (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + +defm VST2LNdWB_fixed_Asm : NEONDT8AsmPseudoInst<"vst2${p}", "$list, $addr!", + (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VST2LNdWB_fixed_Asm : NEONDT16AsmPseudoInst<"vst2${p}", "$list, $addr!", + (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VST2LNdWB_fixed_Asm : NEONDT32AsmPseudoInst<"vst2${p}", "$list, $addr!", + (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>; +defm VST2LNdWB_register_Asm : + NEONDT8AsmPseudoInst<"vst2${p}", "$list, $addr, $Rm", + (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; +defm VST2LNdWB_register_Asm : + NEONDT16AsmPseudoInst<"vst2${p}", "$list, $addr, $Rm", + (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; +defm VST2LNdWB_register_Asm : + NEONDT32AsmPseudoInst<"vst2${p}", "$list, $addr, $Rm", + (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, + rGPR:$Rm, pred:$p)>; + +// VMOV takes an optional datatype suffix +defm : VFPDTAnyInstAlias<"vmov${p}", "$Vd, $Vm", + (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>; +defm : VFPDTAnyInstAlias<"vmov${p}", "$Vd, $Vm", + (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>; + +// VCLT (register) is an assembler alias for VCGT w/ the operands reversed. +// D-register versions. 
+def : NEONInstAlias<"vcle${p}.s8 $Dd, $Dn, $Dm", + (VCGEsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.s16 $Dd, $Dn, $Dm", + (VCGEsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.s32 $Dd, $Dn, $Dm", + (VCGEsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u8 $Dd, $Dn, $Dm", + (VCGEuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u16 $Dd, $Dn, $Dm", + (VCGEuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u32 $Dd, $Dn, $Dm", + (VCGEuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.f32 $Dd, $Dn, $Dm", + (VCGEfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +// Q-register versions. +def : NEONInstAlias<"vcle${p}.s8 $Qd, $Qn, $Qm", + (VCGEsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.s16 $Qd, $Qn, $Qm", + (VCGEsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.s32 $Qd, $Qn, $Qm", + (VCGEsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u8 $Qd, $Qn, $Qm", + (VCGEuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u16 $Qd, $Qn, $Qm", + (VCGEuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u32 $Qd, $Qn, $Qm", + (VCGEuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.f32 $Qd, $Qn, $Qm", + (VCGEfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; + +// VCLT (register) is an assembler alias for VCGT w/ the operands reversed. +// D-register versions. +def : NEONInstAlias<"vclt${p}.s8 $Dd, $Dn, $Dm", + (VCGTsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.s16 $Dd, $Dn, $Dm", + (VCGTsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.s32 $Dd, $Dn, $Dm", + (VCGTsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u8 $Dd, $Dn, $Dm", + (VCGTuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u16 $Dd, $Dn, $Dm", + (VCGTuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u32 $Dd, $Dn, $Dm", + (VCGTuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.f32 $Dd, $Dn, $Dm", + (VCGTfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +// Q-register versions. 
+def : NEONInstAlias<"vclt${p}.s8 $Qd, $Qn, $Qm", + (VCGTsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.s16 $Qd, $Qn, $Qm", + (VCGTsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.s32 $Qd, $Qn, $Qm", + (VCGTsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u8 $Qd, $Qn, $Qm", + (VCGTuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u16 $Qd, $Qn, $Qm", + (VCGTuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm", + (VCGTuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm", + (VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; + +// Two-operand variants for VEXT +def : NEONInstAlias<"vext${p}.8 $Vdn, $Vm, $imm", + (VEXTd8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_7:$imm, pred:$p)>; +def : NEONInstAlias<"vext${p}.16 $Vdn, $Vm, $imm", + (VEXTd16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_3:$imm, pred:$p)>; +def : NEONInstAlias<"vext${p}.32 $Vdn, $Vm, $imm", + (VEXTd32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_1:$imm, pred:$p)>; + +def : NEONInstAlias<"vext${p}.8 $Vdn, $Vm, $imm", + (VEXTq8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_15:$imm, pred:$p)>; +def : NEONInstAlias<"vext${p}.16 $Vdn, $Vm, $imm", + (VEXTq16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_7:$imm, pred:$p)>; +def : NEONInstAlias<"vext${p}.32 $Vdn, $Vm, $imm", + (VEXTq32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_3:$imm, pred:$p)>; +def : NEONInstAlias<"vext${p}.64 $Vdn, $Vm, $imm", + (VEXTq64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_1:$imm, pred:$p)>; + +// Two-operand variants for VQDMULH +def : NEONInstAlias<"vqdmulh${p}.s16 $Vdn, $Vm", + (VQDMULHv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqdmulh${p}.s32 $Vdn, $Vm", + (VQDMULHv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vqdmulh${p}.s16 $Vdn, $Vm", + (VQDMULHv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vqdmulh${p}.s32 $Vdn, $Vm", + (VQDMULHv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + +// 'gas' compatibility aliases for quad-word instructions. Strictly speaking, +// these should restrict to just the Q register variants, but the register +// classes are enough to match correctly regardless, so we keep it simple +// and just use MnemonicAlias. +def : NEONMnemonicAlias<"vbicq", "vbic">; +def : NEONMnemonicAlias<"vandq", "vand">; +def : NEONMnemonicAlias<"veorq", "veor">; +def : NEONMnemonicAlias<"vorrq", "vorr">; + +def : NEONMnemonicAlias<"vmovq", "vmov">; +def : NEONMnemonicAlias<"vmvnq", "vmvn">; +// Explicit versions for floating point so that the FPImm variants get +// handled early. The parser gets confused otherwise. 
+def : NEONMnemonicAlias<"vmovq.f32", "vmov.f32">; +def : NEONMnemonicAlias<"vmovq.f64", "vmov.f64">; + +def : NEONMnemonicAlias<"vaddq", "vadd">; +def : NEONMnemonicAlias<"vsubq", "vsub">; + +def : NEONMnemonicAlias<"vminq", "vmin">; +def : NEONMnemonicAlias<"vmaxq", "vmax">; + +def : NEONMnemonicAlias<"vmulq", "vmul">; + +def : NEONMnemonicAlias<"vabsq", "vabs">; + +def : NEONMnemonicAlias<"vshlq", "vshl">; +def : NEONMnemonicAlias<"vshrq", "vshr">; + +def : NEONMnemonicAlias<"vcvtq", "vcvt">; + +def : NEONMnemonicAlias<"vcleq", "vcle">; +def : NEONMnemonicAlias<"vceqq", "vceq">; diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index c6cc98d..ac1a229 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -1131,9 +1131,6 @@ def tRSB : // A8.6.141 "rsb", "\t$Rd, $Rn, #0", [(set tGPR:$Rd, (ineg tGPR:$Rn))]>; -def : tInstAlias<"neg${s}${p} $Rd, $Rm", - (tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>; - // Subtract with carry register let Uses = [CPSR] in def tSBC : // A8.6.151 @@ -1435,3 +1432,8 @@ def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>,Requires<[IsThumb, IsThumb1Only]>; // nothing). def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>; def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>; + +// "neg" is and alias for "rsb rd, rn, #0" +def : tInstAlias<"neg${s}${p} $Rd, $Rm", + (tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>; + diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 6129fa3..981592c 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -80,18 +80,19 @@ def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{ // only used on aliases (Pat<> and InstAlias<>). The actual encoding // is handled by the destination instructions, which use t2_so_imm. def t2_so_imm_not_asmoperand : AsmOperandClass { let Name = "T2SOImmNot"; } -def t2_so_imm_not : Operand<i32>, - PatLeaf<(imm), [{ +def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{ return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1; }], t2_so_imm_not_XFORM> { let ParserMatchClass = t2_so_imm_not_asmoperand; } // t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm. -def t2_so_imm_neg : Operand<i32>, - PatLeaf<(imm), [{ +def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; } +def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{ return ARM_AM::getT2SOImmVal(-((uint32_t)N->getZExtValue())) != -1; -}], t2_so_imm_neg_XFORM>; +}], t2_so_imm_neg_XFORM> { + let ParserMatchClass = t2_so_imm_neg_asmoperand; +} /// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095]. 
def imm0_4095 : Operand<i32>, @@ -1333,7 +1334,7 @@ def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), let mayStore = 1, neverHasSideEffects = 1 in { def t2STR_PRE : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb), - (ins rGPR:$Rt, t2addrmode_imm8:$addr), + (ins GPRnopc:$Rt, t2addrmode_imm8:$addr), AddrModeT2_i8, IndexModePre, IIC_iStore_iu, "str", "\t$Rt, $addr!", "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { @@ -1357,13 +1358,13 @@ def t2STRB_PRE : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb), } // mayStore = 1, neverHasSideEffects = 1 def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb), - (ins rGPR:$Rt, addr_offset_none:$Rn, + (ins GPRnopc:$Rt, addr_offset_none:$Rn, t2am_imm8_offset:$offset), AddrModeT2_i8, IndexModePost, IIC_iStore_iu, "str", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPRnopc:$Rn_wb, - (post_store rGPR:$Rt, addr_offset_none:$Rn, + (post_store GPRnopc:$Rt, addr_offset_none:$Rn, t2am_imm8_offset:$offset))]>; def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb), @@ -3971,6 +3972,18 @@ def : t2InstAlias<"push${p} $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>; def : t2InstAlias<"pop${p}.w $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>; def : t2InstAlias<"pop${p} $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>; +// STMIA/STMIA_UPD aliases w/o the optional .w suffix +def : t2InstAlias<"stm${p} $Rn, $regs", + (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>; +def : t2InstAlias<"stm${p} $Rn!, $regs", + (t2STMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>; + +// LDMIA/LDMIA_UPD aliases w/o the optional .w suffix +def : t2InstAlias<"ldm${p} $Rn, $regs", + (t2LDMIA GPR:$Rn, pred:$p, reglist:$regs)>; +def : t2InstAlias<"ldm${p} $Rn!, $regs", + (t2LDMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>; + // STMDB/STMDB_UPD aliases w/ the optional .w suffix def : t2InstAlias<"stmdb${p}.w $Rn, $regs", (t2STMDB GPR:$Rn, pred:$p, reglist:$regs)>; @@ -4084,8 +4097,50 @@ def : t2InstAlias<"sxth${p} $Rd, $Rm$rot", // for isel. def : t2InstAlias<"mov${p} $Rd, $imm", (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>; +def : t2InstAlias<"mvn${p} $Rd, $imm", + (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>; +// Same for AND <--> BIC +def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm", + (t2ANDri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : t2InstAlias<"bic${s}${p} $Rdn, $imm", + (t2ANDri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm", + (t2BICri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : t2InstAlias<"and${s}${p} $Rdn, $imm", + (t2BICri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm, + pred:$p, cc_out:$s)>; +// Likewise, "add Rd, t2_so_imm_neg" -> sub +def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm", + (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, + pred:$p, cc_out:$s)>; +def : t2InstAlias<"add${s}${p} $Rd, $imm", + (t2SUBri GPRnopc:$Rd, GPRnopc:$Rd, t2_so_imm_neg:$imm, + pred:$p, cc_out:$s)>; +// Same for CMP <--> CMN via t2_so_imm_neg +def : t2InstAlias<"cmp${p} $Rd, $imm", + (t2CMNzri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>; +def : t2InstAlias<"cmn${p} $Rd, $imm", + (t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>; // Wide 'mul' encoding can be specified with only two operands. 
def : t2InstAlias<"mul${p} $Rn, $Rm", - (t2MUL rGPR:$Rn, rGPR:$Rn, rGPR:$Rm, pred:$p)>; + (t2MUL rGPR:$Rn, rGPR:$Rm, rGPR:$Rn, pred:$p)>; + +// "neg" is and alias for "rsb rd, rn, #0" +def : t2InstAlias<"neg${s}${p} $Rd, $Rm", + (t2RSBri rGPR:$Rd, rGPR:$Rm, 0, pred:$p, cc_out:$s)>; + +// MOV so_reg assembler pseudos. InstAlias isn't expressive enough for +// these, unfortunately. +def t2MOVsi: t2AsmPseudo<"mov${p} $Rd, $shift", + (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>; +def t2MOVSsi: t2AsmPseudo<"movs${p} $Rd, $shift", + (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>; + +// ADR w/o the .w suffix +def : t2InstAlias<"adr${p} $Rd, $addr", + (t2ADR rGPR:$Rd, t2adrlabel:$addr, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index e420135..5d43556 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -1160,18 +1160,64 @@ def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm), //===----------------------------------------------------------------------===// // Assembler aliases. // +// A few mnemnoic aliases for pre-unifixed syntax. We don't guarantee to +// support them all, but supporting at least some of the basics is +// good to be friendly. +def : VFP2MnemonicAlias<"flds", "vldr">; +def : VFP2MnemonicAlias<"fldd", "vldr">; +def : VFP2MnemonicAlias<"fmrs", "vmov">; +def : VFP2MnemonicAlias<"fmsr", "vmov">; +def : VFP2MnemonicAlias<"fsqrts", "vsqrt">; +def : VFP2MnemonicAlias<"fsqrtd", "vsqrt">; +def : VFP2MnemonicAlias<"fadds", "vadd.f32">; +def : VFP2MnemonicAlias<"faddd", "vadd.f64">; +def : VFP2MnemonicAlias<"fmrdd", "vmov">; +def : VFP2MnemonicAlias<"fmrds", "vmov">; +def : VFP2MnemonicAlias<"fmrrd", "vmov">; +def : VFP2MnemonicAlias<"fmdrr", "vmov">; +def : VFP2MnemonicAlias<"fmuld", "vmul.f64">; +def : VFP2MnemonicAlias<"fnegs", "vneg.f32">; +def : VFP2MnemonicAlias<"fnegd", "vneg.f64">; +def : VFP2MnemonicAlias<"ftosizd", "vcvt.s32.f64">; +def : VFP2MnemonicAlias<"ftosid", "vcvtr.s32.f64">; +def : VFP2MnemonicAlias<"ftosizs", "vcvt.s32.f32">; +def : VFP2MnemonicAlias<"ftosis", "vcvtr.s32.f32">; +def : VFP2MnemonicAlias<"ftouizd", "vcvt.u32.f64">; +def : VFP2MnemonicAlias<"ftouid", "vcvtr.u32.f64">; +def : VFP2MnemonicAlias<"ftouizs", "vcvt.u32.f32">; +def : VFP2MnemonicAlias<"ftouis", "vcvtr.u32.f32">; +def : VFP2MnemonicAlias<"fsitod", "vcvt.f64.s32">; +def : VFP2MnemonicAlias<"fsitos", "vcvt.f32.s32">; +def : VFP2MnemonicAlias<"fuitod", "vcvt.f64.u32">; +def : VFP2MnemonicAlias<"fuitos", "vcvt.f32.u32">; +def : VFP2MnemonicAlias<"fsts", "vstr">; +def : VFP2MnemonicAlias<"fstd", "vstr">; +def : VFP2MnemonicAlias<"fmacd", "vmla.f64">; +def : VFP2MnemonicAlias<"fmacs", "vmla.f32">; def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>; +def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm", + (VADDS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>; +def : VFP2InstAlias<"faddd${p} $Dd, $Dn, $Dm", + (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>; +def : VFP2InstAlias<"fsubs${p} $Sd, $Sn, $Sm", + (VSUBS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>; +def : VFP2InstAlias<"fsubd${p} $Dd, $Dn, $Dm", + (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>; + +// No need for the size suffix on VSQRT. It's implied by the register classes. +def : VFP2InstAlias<"vsqrt${p} $Sd, $Sm", (VSQRTS SPR:$Sd, SPR:$Sm, pred:$p)>; +def : VFP2InstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>; // VLDR/VSTR accept an optional type suffix. 
-defm : VFPDT32InstAlias<"vldr${p}", "$Sd, $addr", - (VLDRS SPR:$Sd, addrmode5:$addr, pred:$p)>; -defm : VFPDT32InstAlias<"vstr${p}", "$Sd, $addr", - (VSTRS SPR:$Sd, addrmode5:$addr, pred:$p)>; -defm : VFPDT64InstAlias<"vldr${p}", "$Dd, $addr", - (VLDRD DPR:$Dd, addrmode5:$addr, pred:$p)>; -defm : VFPDT64InstAlias<"vstr${p}", "$Dd, $addr", - (VSTRD DPR:$Dd, addrmode5:$addr, pred:$p)>; +def : VFP2InstAlias<"vldr${p}.32 $Sd, $addr", + (VLDRS SPR:$Sd, addrmode5:$addr, pred:$p)>; +def : VFP2InstAlias<"vstr${p}.32 $Sd, $addr", + (VSTRS SPR:$Sd, addrmode5:$addr, pred:$p)>; +def : VFP2InstAlias<"vldr${p}.64 $Dd, $addr", + (VLDRD DPR:$Dd, addrmode5:$addr, pred:$p)>; +def : VFP2InstAlias<"vstr${p}.64 $Dd, $addr", + (VSTRD DPR:$Dd, addrmode5:$addr, pred:$p)>; // VMUL has a two-operand form (implied destination operand) def : VFP2InstAlias<"vmul${p}.f64 $Dn, $Dm", diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index c8728f4..6712fb6 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -33,6 +33,7 @@ #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -1471,19 +1472,18 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, while (++I != E) { if (I->isDebugValue() || MemOps.count(&*I)) continue; - const MCInstrDesc &MCID = I->getDesc(); - if (MCID.isCall() || MCID.isTerminator() || I->hasUnmodeledSideEffects()) + if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects()) return false; - if (isLd && MCID.mayStore()) + if (isLd && I->mayStore()) return false; if (!isLd) { - if (MCID.mayLoad()) + if (I->mayLoad()) return false; // It's not safe to move the first 'str' down. // str r1, [r0] // strh r5, [r0] // str r4, [r0, #+4] - if (MCID.mayStore()) + if (I->mayStore()) return false; } for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) { @@ -1773,8 +1773,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { while (MBBI != E) { for (; MBBI != E; ++MBBI) { MachineInstr *MI = MBBI; - const MCInstrDesc &MCID = MI->getDesc(); - if (MCID.isCall() || MCID.isTerminator()) { + if (MI->isCall() || MI->isTerminator()) { // Stop at barriers. 
++MBBI; break; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 6cbb24b..61b75cb 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -38,22 +38,25 @@ extern "C" void LLVMInitializeARMTarget() { /// ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), Subtarget(TT, CPU, FS), JITInfo(), InstrItins(Subtarget.getInstrItineraryData()) { // Default to soft float ABI - if (FloatABIType == FloatABI::Default) - FloatABIType = FloatABI::Soft; + if (Options.FloatABIType == FloatABI::Default) + this->Options.FloatABIType = FloatABI::Soft; } ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM, OL), InstrInfo(Subtarget), + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + InstrInfo(Subtarget), DataLayout(Subtarget.isAPCS_ABI() ? std::string("e-p:32:32-f64:32:64-i64:32:64-" "v128:32:128-v64:32:64-n32-S32") : @@ -73,9 +76,10 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM, OL), + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), InstrInfo(Subtarget.hasThumb2() ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget)) : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), @@ -143,10 +147,16 @@ bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM) { } bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM) { - if (Subtarget.isThumb2() && !Subtarget.prefers32BitThumb()) - PM.add(createThumb2SizeReductionPass()); + if (Subtarget.isThumb2()) { + if (!Subtarget.prefers32BitThumb()) + PM.add(createThumb2SizeReductionPass()); + + // The constant island pass works on unbundled instructions.
+ PM.add(createUnpackMachineBundlesPass()); + } PM.add(createARMConstantIslandPass()); + return true; } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index a1f517b..cd77822 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -41,6 +41,7 @@ private: public: ARMBaseTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); @@ -71,6 +72,7 @@ class ARMTargetMachine : public ARMBaseTargetMachine { public: ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); @@ -112,6 +114,7 @@ class ThumbTargetMachine : public ARMBaseTargetMachine { public: ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index 19defa1..721a225 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -36,6 +36,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, ELF::SHF_WRITE | ELF::SHF_ALLOC, SectionKind::getDataRel()); + StructorOutputOrder = Structors::PriorityOrder; LSDASection = NULL; } diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index bb83e5e..cd86065 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -39,10 +39,15 @@ namespace { class ARMOperand; +enum VectorLaneTy { NoLanes, AllLanes, IndexedLane }; + class ARMAsmParser : public MCTargetAsmParser { MCSubtargetInfo &STI; MCAsmParser &Parser; + // Map of register aliases registered via the .req directive. + StringMap<unsigned> RegisterReqs; + struct { ARMCC::CondCodes Cond; // Condition for IT block. unsigned Mask:4; // Condition mask for instructions.
@@ -90,9 +95,12 @@ class ARMAsmParser : public MCTargetAsmParser { unsigned &ShiftAmount); bool parseDirectiveWord(unsigned Size, SMLoc L); bool parseDirectiveThumb(SMLoc L); + bool parseDirectiveARM(SMLoc L); bool parseDirectiveThumbFunc(SMLoc L); bool parseDirectiveCode(SMLoc L); bool parseDirectiveSyntax(SMLoc L); + bool parseDirectiveReq(StringRef Name, SMLoc L); + bool parseDirectiveUnreq(SMLoc L); StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode, bool &CarrySetting, unsigned &ProcessorIMod, @@ -161,6 +169,7 @@ class ARMAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*>&); OperandMatchResultTy parseFPImm(SmallVectorImpl<MCParsedAsmOperand*>&); OperandMatchResultTy parseVectorList(SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index); // Asm Match Converter Methods bool cvtT2LdrdPre(MCInst &Inst, unsigned Opcode, @@ -271,6 +280,8 @@ class ARMOperand : public MCParsedAsmOperand { k_DPRRegisterList, k_SPRRegisterList, k_VectorList, + k_VectorListAllLanes, + k_VectorListIndexed, k_ShiftedRegister, k_ShiftedImmediate, k_ShifterImmediate, @@ -324,6 +335,8 @@ class ARMOperand : public MCParsedAsmOperand { struct { unsigned RegNum; unsigned Count; + unsigned LaneIndex; + bool isDoubleSpaced; } VectorList; struct { @@ -409,6 +422,8 @@ public: Registers = o.Registers; break; case k_VectorList: + case k_VectorListAllLanes: + case k_VectorListIndexed: VectorList = o.VectorList; break; case k_CoprocNum: @@ -562,6 +577,22 @@ public: int64_t Value = CE->getValue(); return Value >= 0 && Value < 256; } + bool isImm0_1() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 2; + } + bool isImm0_3() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 4; + } bool isImm0_7() const { if (Kind != k_Immediate) return false; @@ -586,6 +617,94 @@ public: int64_t Value = CE->getValue(); return Value >= 0 && Value < 32; } + bool isImm0_63() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 64; + } + bool isImm8() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value == 8; + } + bool isImm16() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value == 16; + } + bool isImm32() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value == 32; + } + bool isShrImm8() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= 8; + } + bool isShrImm16() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) 
return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= 16; + } + bool isShrImm32() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= 32; + } + bool isShrImm64() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= 64; + } + bool isImm1_7() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 8; + } + bool isImm1_15() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 16; + } + bool isImm1_31() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 32; + } bool isImm1_16() const { if (Kind != k_Immediate) return false; @@ -676,6 +795,14 @@ public: int64_t Value = CE->getValue(); return ARM_AM::getSOImmVal(~Value) != -1; } + bool isARMSOImmNeg() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ARM_AM::getSOImmVal(-Value) != -1; + } bool isT2SOImm() const { if (Kind != k_Immediate) return false; @@ -692,6 +819,14 @@ public: int64_t Value = CE->getValue(); return ARM_AM::getT2SOImmVal(~Value) != -1; } + bool isT2SOImmNeg() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ARM_AM::getT2SOImmVal(-Value) != -1; + } bool isSetEndImm() const { if (Kind != k_Immediate) return false; @@ -892,9 +1027,9 @@ public: if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) return false; // Immediate offset in range [-255, -1]. - if (!Memory.OffsetImm) return true; + if (!Memory.OffsetImm) return false; int64_t Val = Memory.OffsetImm->getValue(); - return Val > -256 && Val < 0; + return (Val == INT32_MIN) || (Val > -256 && Val < 0); } bool isMemUImm12Offset() const { if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) @@ -940,31 +1075,75 @@ public: bool isProcIFlags() const { return Kind == k_ProcIFlags; } // NEON operands. 
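// Terminology note (assumed from context): a "single-spaced" vector list uses
// consecutive D registers, e.g. {d0, d1, d2}; a "double-spaced" list skips every
// other D register, e.g. {d0, d2, d4}.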
+ bool isSingleSpacedVectorList() const { + return Kind == k_VectorList && !VectorList.isDoubleSpaced; + } + bool isDoubleSpacedVectorList() const { + return Kind == k_VectorList && VectorList.isDoubleSpaced; + } bool isVecListOneD() const { - if (Kind != k_VectorList) return false; + if (!isSingleSpacedVectorList()) return false; return VectorList.Count == 1; } bool isVecListTwoD() const { - if (Kind != k_VectorList) return false; + if (!isSingleSpacedVectorList()) return false; return VectorList.Count == 2; } bool isVecListThreeD() const { - if (Kind != k_VectorList) return false; + if (!isSingleSpacedVectorList()) return false; return VectorList.Count == 3; } bool isVecListFourD() const { - if (Kind != k_VectorList) return false; + if (!isSingleSpacedVectorList()) return false; return VectorList.Count == 4; } bool isVecListTwoQ() const { - if (Kind != k_VectorList) return false; - //FIXME: We haven't taught the parser to handle by-two register lists - // yet, so don't pretend to know one. - return VectorList.Count == 2 && false; + if (!isDoubleSpacedVectorList()) return false; + return VectorList.Count == 2; + } + + bool isVecListOneDAllLanes() const { + if (Kind != k_VectorListAllLanes) return false; + return VectorList.Count == 1; + } + + bool isVecListTwoDAllLanes() const { + if (Kind != k_VectorListAllLanes) return false; + return VectorList.Count == 2; + } + + bool isVecListOneDByteIndexed() const { + if (Kind != k_VectorListIndexed) return false; + return VectorList.Count == 1 && VectorList.LaneIndex <= 7; + } + + bool isVecListOneDHWordIndexed() const { + if (Kind != k_VectorListIndexed) return false; + return VectorList.Count == 1 && VectorList.LaneIndex <= 3; + } + + bool isVecListOneDWordIndexed() const { + if (Kind != k_VectorListIndexed) return false; + return VectorList.Count == 1 && VectorList.LaneIndex <= 1; + } + + bool isVecListTwoDByteIndexed() const { + if (Kind != k_VectorListIndexed) return false; + return VectorList.Count == 2 && VectorList.LaneIndex <= 7; + } + + bool isVecListTwoDHWordIndexed() const { + if (Kind != k_VectorListIndexed) return false; + return VectorList.Count == 2 && VectorList.LaneIndex <= 3; + } + + bool isVecListTwoDWordIndexed() const { + if (Kind != k_VectorListIndexed) return false; + return VectorList.Count == 2 && VectorList.LaneIndex <= 1; } bool isVectorIndex8() const { @@ -1233,6 +1412,14 @@ public: Inst.addOperand(MCOperand::CreateImm(~CE->getValue())); } + void addT2SOImmNegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The operand is actually a t2_so_imm, but we have its + // negation in the assembly source, so twiddle it here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::CreateImm(-CE->getValue())); + } + void addARMSOImmNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The operand is actually a so_imm, but we have its bitwise @@ -1241,6 +1428,14 @@ public: Inst.addOperand(MCOperand::CreateImm(~CE->getValue())); } + void addARMSOImmNegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The operand is actually a so_imm, but we have its + // negation in the assembly source, so twiddle it here. 
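// For example (illustrative), "add r0, r1, #-4" can be encoded as the
// equivalent "sub r0, r1, #4" once the immediate is negated here.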
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::CreateImm(-CE->getValue())); + } + void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt()))); @@ -1527,37 +1722,15 @@ public: Inst.addOperand(MCOperand::CreateImm(unsigned(getProcIFlags()))); } - void addVecListOneDOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum)); - } - - void addVecListTwoDOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - // Only the first register actually goes on the instruction. The rest - // are implied by the opcode. - Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum)); - } - - void addVecListThreeDOperands(MCInst &Inst, unsigned N) const { + void addVecListOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - // Only the first register actually goes on the instruction. The rest - // are implied by the opcode. Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum)); } - void addVecListFourDOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - // Only the first register actually goes on the instruction. The rest - // are implied by the opcode. - Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum)); - } - - void addVecListTwoQOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - // Only the first register actually goes on the instruction. The rest - // are implied by the opcode. + void addVecListIndexedOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum)); + Inst.addOperand(MCOperand::CreateImm(VectorList.LaneIndex)); } void addVectorIndex8Operands(MCInst &Inst, unsigned N) const { @@ -1780,10 +1953,32 @@ public: } static ARMOperand *CreateVectorList(unsigned RegNum, unsigned Count, - SMLoc S, SMLoc E) { + bool isDoubleSpaced, SMLoc S, SMLoc E) { ARMOperand *Op = new ARMOperand(k_VectorList); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; + Op->VectorList.isDoubleSpaced = isDoubleSpaced; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateVectorListAllLanes(unsigned RegNum, unsigned Count, + SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_VectorListAllLanes); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateVectorListIndexed(unsigned RegNum, unsigned Count, + unsigned Index, SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_VectorListIndexed); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.LaneIndex = Index; Op->StartLoc = S; Op->EndLoc = E; return Op; @@ -1982,6 +2177,14 @@ void ARMOperand::print(raw_ostream &OS) const { OS << "<vector_list " << VectorList.Count << " * " << VectorList.RegNum << ">"; break; + case k_VectorListAllLanes: + OS << "<vector_list(all lanes) " << VectorList.Count << " * " + << VectorList.RegNum << ">"; + break; + case k_VectorListIndexed: + OS << "<vector_list(lane " << VectorList.LaneIndex << ") " + << VectorList.Count << " * " << VectorList.RegNum << ">"; + break; case k_Token: OS << "'" << getToken() << "'"; break; @@ -2000,7 +2203,9 @@ static unsigned 
MatchRegisterName(StringRef Name); bool ARMAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + StartLoc = Parser.getTok().getLoc(); RegNo = tryParseRegister(); + EndLoc = Parser.getTok().getLoc(); return (RegNo == (unsigned)-1); } @@ -2013,8 +2218,6 @@ int ARMAsmParser::tryParseRegister() { const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) return -1; - // FIXME: Validate register for the current architecture; we have to do - // validation later, so maybe there is no need for this here. std::string lowerCase = Tok.getString().lower(); unsigned RegNum = MatchRegisterName(lowerCase); if (!RegNum) { @@ -2023,9 +2226,34 @@ int ARMAsmParser::tryParseRegister() { .Case("r14", ARM::LR) .Case("r15", ARM::PC) .Case("ip", ARM::R12) + // Additional register name aliases for 'gas' compatibility. + .Case("a1", ARM::R0) + .Case("a2", ARM::R1) + .Case("a3", ARM::R2) + .Case("a4", ARM::R3) + .Case("v1", ARM::R4) + .Case("v2", ARM::R5) + .Case("v3", ARM::R6) + .Case("v4", ARM::R7) + .Case("v5", ARM::R8) + .Case("v6", ARM::R9) + .Case("v7", ARM::R10) + .Case("v8", ARM::R11) + .Case("sb", ARM::R9) + .Case("sl", ARM::R10) + .Case("fp", ARM::R11) .Default(0); } - if (!RegNum) return -1; + if (!RegNum) { + // Check for aliases registered via .req. + StringMap<unsigned>::const_iterator Entry = + RegisterReqs.find(Tok.getIdentifier()); + // If no match, return failure. + if (Entry == RegisterReqs.end()) + return -1; + Parser.Lex(); // Eat identifier token. + return Entry->getValue(); + } Parser.Lex(); // Eat identifier token. @@ -2045,6 +2273,7 @@ int ARMAsmParser::tryParseShiftRegister( std::string lowerCase = Tok.getString().lower(); ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase) + .Case("asl", ARM_AM::lsl) .Case("lsl", ARM_AM::lsl) .Case("lsr", ARM_AM::lsr) .Case("asr", ARM_AM::asr) @@ -2073,7 +2302,8 @@ int ARMAsmParser::tryParseShiftRegister( ShiftReg = SrcReg; } else { // Figure out if this is shifted by a constant or a register (for non-RRX). - if (Parser.getTok().is(AsmToken::Hash)) { + if (Parser.getTok().is(AsmToken::Hash) || + Parser.getTok().is(AsmToken::Dollar)) { Parser.Lex(); // Eat hash. SMLoc ImmLoc = Parser.getTok().getLoc(); const MCExpr *ShiftExpr = 0; @@ -2446,6 +2676,7 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { Parser.Lex(); // Eat the comma. RegLoc = Parser.getTok().getLoc(); int OldReg = Reg; + const AsmToken RegTok = Parser.getTok(); Reg = tryParseRegister(); if (Reg == -1) return Error(RegLoc, "register expected"); @@ -2459,8 +2690,13 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { if (!RC->contains(Reg)) return Error(RegLoc, "invalid register in register list"); // List must be monotonically increasing. - if (getARMRegisterNumbering(Reg) <= getARMRegisterNumbering(OldReg)) + if (getARMRegisterNumbering(Reg) < getARMRegisterNumbering(OldReg)) return Error(RegLoc, "register list not in ascending order"); + if (getARMRegisterNumbering(Reg) == getARMRegisterNumbering(OldReg)) { + Warning(RegLoc, "duplicated register (" + RegTok.getString() + + ") in register list"); + continue; + } // VFP register lists must also be contiguous. // It's OK to use the enumeration values directly here rather, as the // VFP register classes have the enum sorted properly. @@ -2477,13 +2713,55 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { return Error(E, "'}' expected"); Parser.Lex(); // Eat '}' token. + // Push the register list operand. 
Operands.push_back(ARMOperand::CreateRegList(Registers, S, E)); + + // The ARM system instruction variants for LDM/STM have a '^' token here. + if (Parser.getTok().is(AsmToken::Caret)) { + Operands.push_back(ARMOperand::CreateToken("^",Parser.getTok().getLoc())); + Parser.Lex(); // Eat '^' token. + } + return false; } +// Helper function to parse the lane index for vector lists. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index) { + Index = 0; // Always return a defined index value. + if (Parser.getTok().is(AsmToken::LBrac)) { + Parser.Lex(); // Eat the '['. + if (Parser.getTok().is(AsmToken::RBrac)) { + // "Dn[]" is the 'all lanes' syntax. + LaneKind = AllLanes; + Parser.Lex(); // Eat the ']'. + return MatchOperand_Success; + } + if (Parser.getTok().is(AsmToken::Integer)) { + int64_t Val = Parser.getTok().getIntVal(); + // Make this range check context sensitive for .8, .16, .32. + if (Val < 0 || Val > 7) + Error(Parser.getTok().getLoc(), "lane index out of range"); + Index = Val; + LaneKind = IndexedLane; + Parser.Lex(); // Eat the token. + if (Parser.getTok().isNot(AsmToken::RBrac)) + Error(Parser.getTok().getLoc(), "']' expected"); + Parser.Lex(); // Eat the ']'. + return MatchOperand_Success; + } + Error(Parser.getTok().getLoc(), "lane index must be empty or an integer"); + return MatchOperand_ParseFail; + } + LaneKind = NoLanes; + return MatchOperand_Success; +} + // parse a vector register list ARMAsmParser::OperandMatchResultTy ARMAsmParser:: parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + VectorLaneTy LaneKind; + unsigned LaneIndex; SMLoc S = Parser.getTok().getLoc(); // As an extension (to match gas), support a plain D register or Q register // (without enclosing curly braces) as a single or double entry list, @@ -2494,12 +2772,48 @@ parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { return MatchOperand_NoMatch; SMLoc E = Parser.getTok().getLoc(); if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { - Operands.push_back(ARMOperand::CreateVectorList(Reg, 1, S, E)); + OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex); + if (Res != MatchOperand_Success) + return Res; + switch (LaneKind) { + default: + assert(0 && "unexpected lane kind!"); + case NoLanes: + E = Parser.getTok().getLoc(); + Operands.push_back(ARMOperand::CreateVectorList(Reg, 1, false, S, E)); + break; + case AllLanes: + E = Parser.getTok().getLoc(); + Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 1, S, E)); + break; + case IndexedLane: + Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 1, + LaneIndex, S,E)); + break; + } return MatchOperand_Success; } if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { Reg = getDRegFromQReg(Reg); + OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex); + if (Res != MatchOperand_Success) + return Res; + switch (LaneKind) { + default: + assert(0 && "unexpected lane kind!"); + case NoLanes: + E = Parser.getTok().getLoc(); + Operands.push_back(ARMOperand::CreateVectorList(Reg, 2, false, S, E)); + break; + case AllLanes: + E = Parser.getTok().getLoc(); + Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 2, S, E)); + break; + case IndexedLane: + Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 2, + LaneIndex, S,E)); + break; + } return MatchOperand_Success; } Error(S, "vector register expected"); @@ -2518,18 +2832,30 @@
parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { return MatchOperand_ParseFail; } unsigned Count = 1; + int Spacing = 0; unsigned FirstReg = Reg; // The list is of D registers, but we also allow Q regs and just interpret // them as the two D sub-registers. if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { FirstReg = Reg = getDRegFromQReg(Reg); + Spacing = 1; // double-spacing requires explicit D registers, otherwise + // it's ambiguous with four-register single spaced. ++Reg; ++Count; } + if (parseVectorLane(LaneKind, LaneIndex) != MatchOperand_Success) + return MatchOperand_ParseFail; while (Parser.getTok().is(AsmToken::Comma) || Parser.getTok().is(AsmToken::Minus)) { if (Parser.getTok().is(AsmToken::Minus)) { + if (!Spacing) + Spacing = 1; // Register range implies a single spaced list. + else if (Spacing == 2) { + Error(Parser.getTok().getLoc(), + "sequential registers in double spaced list"); + return MatchOperand_ParseFail; + } Parser.Lex(); // Eat the minus. SMLoc EndLoc = Parser.getTok().getLoc(); int EndReg = tryParseRegister(); @@ -2554,6 +2880,16 @@ parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { Error(EndLoc, "bad range in register list"); return MatchOperand_ParseFail; } + // Parse the lane specifier if present. + VectorLaneTy NextLaneKind; + unsigned NextLaneIndex; + if (parseVectorLane(NextLaneKind, NextLaneIndex) != MatchOperand_Success) + return MatchOperand_ParseFail; + if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) { + Error(EndLoc, "mismatched lane index in register list"); + return MatchOperand_ParseFail; + } + EndLoc = Parser.getTok().getLoc(); // Add all the registers in the range to the register list. Count += EndReg - Reg; @@ -2575,6 +2911,13 @@ parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // The list is of D registers, but we also allow Q regs and just interpret // them as the two D sub-registers. if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { + if (!Spacing) + Spacing = 1; // Register range implies a single spaced list. + else if (Spacing == 2) { + Error(RegLoc, + "invalid register in double-spaced list (must be 'D' register')"); + return MatchOperand_ParseFail; + } Reg = getDRegFromQReg(Reg); if (Reg != OldReg + 1) { Error(RegLoc, "non-contiguous register range"); @@ -2582,14 +2925,45 @@ parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { } ++Reg; Count += 2; + // Parse the lane specifier if present. + VectorLaneTy NextLaneKind; + unsigned NextLaneIndex; + SMLoc EndLoc = Parser.getTok().getLoc(); + if (parseVectorLane(NextLaneKind, NextLaneIndex) != MatchOperand_Success) + return MatchOperand_ParseFail; + if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) { + Error(EndLoc, "mismatched lane index in register list"); + return MatchOperand_ParseFail; + } continue; } - // Normal D register. Just check that it's contiguous and keep going. - if (Reg != OldReg + 1) { + // Normal D register. + // Figure out the register spacing (single or double) of the list if + // we don't know it already. + if (!Spacing) + Spacing = 1 + (Reg == OldReg + 2); + + // Just check that it's contiguous and keep going. + if (Reg != OldReg + Spacing) { Error(RegLoc, "non-contiguous register range"); return MatchOperand_ParseFail; } ++Count; + // Parse the lane specifier if present. 
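// Every element of the list must use the same lane syntax, e.g. {d0[1], d1[1]}
// (illustrative example); a mismatch is rejected just below.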
+ VectorLaneTy NextLaneKind; + unsigned NextLaneIndex; + SMLoc EndLoc = Parser.getTok().getLoc(); + if (parseVectorLane(NextLaneKind, NextLaneIndex) != MatchOperand_Success) + return MatchOperand_ParseFail; + if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) { + Error(EndLoc, "mismatched lane index in register list"); + return MatchOperand_ParseFail; + } + if (Spacing == 2 && LaneKind != NoLanes) { + Error(EndLoc, + "lane index specifier invalid in double spaced register list"); + return MatchOperand_ParseFail; + } } SMLoc E = Parser.getTok().getLoc(); @@ -2599,7 +2973,22 @@ parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { } Parser.Lex(); // Eat '}' token. - Operands.push_back(ARMOperand::CreateVectorList(FirstReg, Count, S, E)); + switch (LaneKind) { + default: + assert(0 && "unexpected lane kind in register list."); + case NoLanes: + Operands.push_back(ARMOperand::CreateVectorList(FirstReg, Count, + (Spacing == 2), S, E)); + break; + case AllLanes: + Operands.push_back(ARMOperand::CreateVectorListAllLanes(FirstReg, Count, + S, E)); + break; + case IndexedLane: + Operands.push_back(ARMOperand::CreateVectorListIndexed(FirstReg, Count, + LaneIndex, S, E)); + break; + } return MatchOperand_Success; } @@ -2786,7 +3175,8 @@ parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands, StringRef Op, Parser.Lex(); // Eat shift type token. // There must be a '#' and a shift amount. - if (Parser.getTok().isNot(AsmToken::Hash)) { + if (Parser.getTok().isNot(AsmToken::Hash) && + Parser.getTok().isNot(AsmToken::Dollar)) { Error(Parser.getTok().getLoc(), "'#' expected"); return MatchOperand_ParseFail; } @@ -2864,7 +3254,8 @@ parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { Parser.Lex(); // Eat the operator. // A '#' and a shift amount. - if (Parser.getTok().isNot(AsmToken::Hash)) { + if (Parser.getTok().isNot(AsmToken::Hash) && + Parser.getTok().isNot(AsmToken::Dollar)) { Error(Parser.getTok().getLoc(), "'#' expected"); return MatchOperand_ParseFail; } @@ -2924,7 +3315,8 @@ parseRotImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { Parser.Lex(); // Eat the operator. // A '#' and a rotate amount. - if (Parser.getTok().isNot(AsmToken::Hash)) { + if (Parser.getTok().isNot(AsmToken::Hash) && + Parser.getTok().isNot(AsmToken::Dollar)) { Error(Parser.getTok().getLoc(), "'#' expected"); return MatchOperand_ParseFail; } @@ -2961,7 +3353,8 @@ ARMAsmParser::OperandMatchResultTy ARMAsmParser:: parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { SMLoc S = Parser.getTok().getLoc(); // The bitfield descriptor is really two operands, the LSB and the width. - if (Parser.getTok().isNot(AsmToken::Hash)) { + if (Parser.getTok().isNot(AsmToken::Hash) && + Parser.getTok().isNot(AsmToken::Dollar)) { Error(Parser.getTok().getLoc(), "'#' expected"); return MatchOperand_ParseFail; } @@ -2993,7 +3386,8 @@ parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { return MatchOperand_ParseFail; } Parser.Lex(); // Eat hash token. - if (Parser.getTok().isNot(AsmToken::Hash)) { + if (Parser.getTok().isNot(AsmToken::Hash) && + Parser.getTok().isNot(AsmToken::Dollar)) { Error(Parser.getTok().getLoc(), "'#' expected"); return MatchOperand_ParseFail; } @@ -3087,7 +3481,8 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { SMLoc S = Tok.getLoc(); // Do immediates first, as we always parse those if we have a '#'.
- if (Parser.getTok().is(AsmToken::Hash)) { + if (Parser.getTok().is(AsmToken::Hash) || + Parser.getTok().is(AsmToken::Dollar)) { Parser.Lex(); // Eat the '#'. // Explicitly look for a '-', as we need to encode negative zero // differently. @@ -3444,7 +3839,7 @@ bool ARMAsmParser:: cvtVLDwbFixed(MCInst &Inst, unsigned Opcode, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Vd - ((ARMOperand*)Operands[3])->addVecListTwoDOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1); // Create a writeback register dummy placeholder. Inst.addOperand(MCOperand::CreateImm(0)); // Vn @@ -3458,7 +3853,7 @@ bool ARMAsmParser:: cvtVLDwbRegister(MCInst &Inst, unsigned Opcode, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Vd - ((ARMOperand*)Operands[3])->addVecListTwoDOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1); // Create a writeback register dummy placeholder. Inst.addOperand(MCOperand::CreateImm(0)); // Vn @@ -3478,7 +3873,7 @@ cvtVSTwbFixed(MCInst &Inst, unsigned Opcode, // Vn ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2); // Vt - ((ARMOperand*)Operands[3])->addVecListTwoDOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); return true; @@ -3494,7 +3889,7 @@ cvtVSTwbRegister(MCInst &Inst, unsigned Opcode, // Vm ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1); // Vt - ((ARMOperand*)Operands[3])->addVecListTwoDOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); return true; @@ -3591,8 +3986,9 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // offset. Be friendly and also accept a plain integer (without a leading // hash) for gas compatibility. if (Parser.getTok().is(AsmToken::Hash) || + Parser.getTok().is(AsmToken::Dollar) || Parser.getTok().is(AsmToken::Integer)) { - if (Parser.getTok().is(AsmToken::Hash)) + if (Parser.getTok().isNot(AsmToken::Integer)) Parser.Lex(); // Eat the '#'. E = Parser.getTok().getLoc(); @@ -3690,7 +4086,8 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St, if (Tok.isNot(AsmToken::Identifier)) return true; StringRef ShiftName = Tok.getString(); - if (ShiftName == "lsl" || ShiftName == "LSL") + if (ShiftName == "lsl" || ShiftName == "LSL" || + ShiftName == "asl" || ShiftName == "ASL") St = ARM_AM::lsl; else if (ShiftName == "lsr" || ShiftName == "LSR") St = ARM_AM::lsr; @@ -3710,7 +4107,8 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St, Loc = Parser.getTok().getLoc(); // A '#' and a shift amount. const AsmToken &HashTok = Parser.getTok(); - if (HashTok.isNot(AsmToken::Hash)) + if (HashTok.isNot(AsmToken::Hash) && + HashTok.isNot(AsmToken::Dollar)) return Error(HashTok.getLoc(), "'#' expected"); Parser.Lex(); // Eat hash token. @@ -3739,7 +4137,8 @@ ARMAsmParser::OperandMatchResultTy ARMAsmParser:: parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { SMLoc S = Parser.getTok().getLoc(); - if (Parser.getTok().isNot(AsmToken::Hash)) + if (Parser.getTok().isNot(AsmToken::Hash) && + Parser.getTok().isNot(AsmToken::Dollar)) return MatchOperand_NoMatch; // Disambiguate the VMOV forms that can accept an FP immediate. 
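// Note: '$' is accepted as an alternative immediate prefix alongside '#',
// apparently for assembler compatibility, so e.g. "mov r0, $42" parses like
// "mov r0, #42" (illustrative example).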
@@ -3852,6 +4251,7 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, return parseMemory(Operands); case AsmToken::LCurly: return parseRegisterList(Operands); + case AsmToken::Dollar: case AsmToken::Hash: { // #42 -> immediate. // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate @@ -3990,7 +4390,9 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic == "mrs" || Mnemonic == "smmls" || Mnemonic == "vabs" || Mnemonic == "vcls" || Mnemonic == "vmls" || Mnemonic == "vmrs" || Mnemonic == "vnmls" || Mnemonic == "vqabs" || Mnemonic == "vrecps" || - Mnemonic == "vrsqrts" || Mnemonic == "srs" || + Mnemonic == "vrsqrts" || Mnemonic == "srs" || Mnemonic == "flds" || + Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" || + Mnemonic == "fsts" || (Mnemonic == "movs" && isThumb()))) { Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1); CarrySetting = true; @@ -4206,9 +4608,27 @@ static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) { return Mnemonic.startswith("vldm") || Mnemonic.startswith("vstm"); } +static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features); /// Parse an arm instruction mnemonic followed by its operands. bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Apply mnemonic aliases before doing anything else, as the destination + // mnemonic may include suffixes and we want to handle them normally. + // The generic tblgen'erated code does this later, at the start of + // MatchInstructionImpl(), but that's too late for aliases that include + // any sort of suffix. + unsigned AvailableFeatures = getAvailableFeatures(); + applyMnemonicAliases(Name, AvailableFeatures); + + // First check for the ARM-specific .req directive. + if (Parser.getTok().is(AsmToken::Identifier) && + Parser.getTok().getIdentifier() == ".req") { + parseDirectiveReq(Name, NameLoc); + // We always return 'error' for this, as we're done with this + // statement and don't need to match the instruction. + return true; + } + // Create the leading tokens for the mnemonic, split by '.' characters. size_t Start = 0, Next = Name.find('.'); StringRef Mnemonic = Name.slice(Start, Next); @@ -4400,12 +4820,21 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc, } } // Similarly, the Thumb1 "RSB" instruction has a literal "#0" on the - // end. Convert it to a token here. + // end. Convert it to a token here. Take care not to convert those + // that should hit the Thumb2 encoding. if (Mnemonic == "rsb" && isThumb() && Operands.size() == 6 && + static_cast<ARMOperand*>(Operands[3])->isReg() && + static_cast<ARMOperand*>(Operands[4])->isReg() && static_cast<ARMOperand*>(Operands[5])->isImm()) { ARMOperand *Op = static_cast<ARMOperand*>(Operands[5]); const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()); - if (CE && CE->getValue() == 0) { + if (CE && CE->getValue() == 0 && + (isThumbOne() || + // The cc_out operand matches the IT block. + ((inITBlock() != CarrySetting) && + // Neither register operand is a high register.
+ (isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) && + isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()))))){ Operands.erase(Operands.begin() + 5); Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc())); delete Op; @@ -4605,11 +5034,495 @@ validateInstruction(MCInst &Inst, return false; } +static unsigned getRealVSTLNOpcode(unsigned Opc) { + switch(Opc) { + default: assert(0 && "unexpected opcode!"); + // VST1LN + case ARM::VST1LNdWB_fixed_Asm_8: case ARM::VST1LNdWB_fixed_Asm_P8: + case ARM::VST1LNdWB_fixed_Asm_I8: case ARM::VST1LNdWB_fixed_Asm_S8: + case ARM::VST1LNdWB_fixed_Asm_U8: + return ARM::VST1LNd8_UPD; + case ARM::VST1LNdWB_fixed_Asm_16: case ARM::VST1LNdWB_fixed_Asm_P16: + case ARM::VST1LNdWB_fixed_Asm_I16: case ARM::VST1LNdWB_fixed_Asm_S16: + case ARM::VST1LNdWB_fixed_Asm_U16: + return ARM::VST1LNd16_UPD; + case ARM::VST1LNdWB_fixed_Asm_32: case ARM::VST1LNdWB_fixed_Asm_F: + case ARM::VST1LNdWB_fixed_Asm_F32: case ARM::VST1LNdWB_fixed_Asm_I32: + case ARM::VST1LNdWB_fixed_Asm_S32: case ARM::VST1LNdWB_fixed_Asm_U32: + return ARM::VST1LNd32_UPD; + case ARM::VST1LNdWB_register_Asm_8: case ARM::VST1LNdWB_register_Asm_P8: + case ARM::VST1LNdWB_register_Asm_I8: case ARM::VST1LNdWB_register_Asm_S8: + case ARM::VST1LNdWB_register_Asm_U8: + return ARM::VST1LNd8_UPD; + case ARM::VST1LNdWB_register_Asm_16: case ARM::VST1LNdWB_register_Asm_P16: + case ARM::VST1LNdWB_register_Asm_I16: case ARM::VST1LNdWB_register_Asm_S16: + case ARM::VST1LNdWB_register_Asm_U16: + return ARM::VST1LNd16_UPD; + case ARM::VST1LNdWB_register_Asm_32: case ARM::VST1LNdWB_register_Asm_F: + case ARM::VST1LNdWB_register_Asm_F32: case ARM::VST1LNdWB_register_Asm_I32: + case ARM::VST1LNdWB_register_Asm_S32: case ARM::VST1LNdWB_register_Asm_U32: + return ARM::VST1LNd32_UPD; + case ARM::VST1LNdAsm_8: case ARM::VST1LNdAsm_P8: + case ARM::VST1LNdAsm_I8: case ARM::VST1LNdAsm_S8: + case ARM::VST1LNdAsm_U8: + return ARM::VST1LNd8; + case ARM::VST1LNdAsm_16: case ARM::VST1LNdAsm_P16: + case ARM::VST1LNdAsm_I16: case ARM::VST1LNdAsm_S16: + case ARM::VST1LNdAsm_U16: + return ARM::VST1LNd16; + case ARM::VST1LNdAsm_32: case ARM::VST1LNdAsm_F: + case ARM::VST1LNdAsm_F32: case ARM::VST1LNdAsm_I32: + case ARM::VST1LNdAsm_S32: case ARM::VST1LNdAsm_U32: + return ARM::VST1LNd32; + + // VST2LN + case ARM::VST2LNdWB_fixed_Asm_8: case ARM::VST2LNdWB_fixed_Asm_P8: + case ARM::VST2LNdWB_fixed_Asm_I8: case ARM::VST2LNdWB_fixed_Asm_S8: + case ARM::VST2LNdWB_fixed_Asm_U8: + return ARM::VST2LNd8_UPD; + case ARM::VST2LNdWB_fixed_Asm_16: case ARM::VST2LNdWB_fixed_Asm_P16: + case ARM::VST2LNdWB_fixed_Asm_I16: case ARM::VST2LNdWB_fixed_Asm_S16: + case ARM::VST2LNdWB_fixed_Asm_U16: + return ARM::VST2LNd16_UPD; + case ARM::VST2LNdWB_fixed_Asm_32: case ARM::VST2LNdWB_fixed_Asm_F: + case ARM::VST2LNdWB_fixed_Asm_F32: case ARM::VST2LNdWB_fixed_Asm_I32: + case ARM::VST2LNdWB_fixed_Asm_S32: case ARM::VST2LNdWB_fixed_Asm_U32: + return ARM::VST2LNd32_UPD; + case ARM::VST2LNdWB_register_Asm_8: case ARM::VST2LNdWB_register_Asm_P8: + case ARM::VST2LNdWB_register_Asm_I8: case ARM::VST2LNdWB_register_Asm_S8: + case ARM::VST2LNdWB_register_Asm_U8: + return ARM::VST2LNd8_UPD; + case ARM::VST2LNdWB_register_Asm_16: case ARM::VST2LNdWB_register_Asm_P16: + case ARM::VST2LNdWB_register_Asm_I16: case ARM::VST2LNdWB_register_Asm_S16: + case ARM::VST2LNdWB_register_Asm_U16: + return ARM::VST2LNd16_UPD; + case ARM::VST2LNdWB_register_Asm_32: case ARM::VST2LNdWB_register_Asm_F: + case ARM::VST2LNdWB_register_Asm_F32: case 
ARM::VST2LNdWB_register_Asm_I32: + case ARM::VST2LNdWB_register_Asm_S32: case ARM::VST2LNdWB_register_Asm_U32: + return ARM::VST2LNd32_UPD; + case ARM::VST2LNdAsm_8: case ARM::VST2LNdAsm_P8: + case ARM::VST2LNdAsm_I8: case ARM::VST2LNdAsm_S8: + case ARM::VST2LNdAsm_U8: + return ARM::VST2LNd8; + case ARM::VST2LNdAsm_16: case ARM::VST2LNdAsm_P16: + case ARM::VST2LNdAsm_I16: case ARM::VST2LNdAsm_S16: + case ARM::VST2LNdAsm_U16: + return ARM::VST2LNd16; + case ARM::VST2LNdAsm_32: case ARM::VST2LNdAsm_F: + case ARM::VST2LNdAsm_F32: case ARM::VST2LNdAsm_I32: + case ARM::VST2LNdAsm_S32: case ARM::VST2LNdAsm_U32: + return ARM::VST2LNd32; + } +} + +static unsigned getRealVLDLNOpcode(unsigned Opc) { + switch(Opc) { + default: assert(0 && "unexpected opcode!"); + // VLD1LN + case ARM::VLD1LNdWB_fixed_Asm_8: case ARM::VLD1LNdWB_fixed_Asm_P8: + case ARM::VLD1LNdWB_fixed_Asm_I8: case ARM::VLD1LNdWB_fixed_Asm_S8: + case ARM::VLD1LNdWB_fixed_Asm_U8: + return ARM::VLD1LNd8_UPD; + case ARM::VLD1LNdWB_fixed_Asm_16: case ARM::VLD1LNdWB_fixed_Asm_P16: + case ARM::VLD1LNdWB_fixed_Asm_I16: case ARM::VLD1LNdWB_fixed_Asm_S16: + case ARM::VLD1LNdWB_fixed_Asm_U16: + return ARM::VLD1LNd16_UPD; + case ARM::VLD1LNdWB_fixed_Asm_32: case ARM::VLD1LNdWB_fixed_Asm_F: + case ARM::VLD1LNdWB_fixed_Asm_F32: case ARM::VLD1LNdWB_fixed_Asm_I32: + case ARM::VLD1LNdWB_fixed_Asm_S32: case ARM::VLD1LNdWB_fixed_Asm_U32: + return ARM::VLD1LNd32_UPD; + case ARM::VLD1LNdWB_register_Asm_8: case ARM::VLD1LNdWB_register_Asm_P8: + case ARM::VLD1LNdWB_register_Asm_I8: case ARM::VLD1LNdWB_register_Asm_S8: + case ARM::VLD1LNdWB_register_Asm_U8: + return ARM::VLD1LNd8_UPD; + case ARM::VLD1LNdWB_register_Asm_16: case ARM::VLD1LNdWB_register_Asm_P16: + case ARM::VLD1LNdWB_register_Asm_I16: case ARM::VLD1LNdWB_register_Asm_S16: + case ARM::VLD1LNdWB_register_Asm_U16: + return ARM::VLD1LNd16_UPD; + case ARM::VLD1LNdWB_register_Asm_32: case ARM::VLD1LNdWB_register_Asm_F: + case ARM::VLD1LNdWB_register_Asm_F32: case ARM::VLD1LNdWB_register_Asm_I32: + case ARM::VLD1LNdWB_register_Asm_S32: case ARM::VLD1LNdWB_register_Asm_U32: + return ARM::VLD1LNd32_UPD; + case ARM::VLD1LNdAsm_8: case ARM::VLD1LNdAsm_P8: + case ARM::VLD1LNdAsm_I8: case ARM::VLD1LNdAsm_S8: + case ARM::VLD1LNdAsm_U8: + return ARM::VLD1LNd8; + case ARM::VLD1LNdAsm_16: case ARM::VLD1LNdAsm_P16: + case ARM::VLD1LNdAsm_I16: case ARM::VLD1LNdAsm_S16: + case ARM::VLD1LNdAsm_U16: + return ARM::VLD1LNd16; + case ARM::VLD1LNdAsm_32: case ARM::VLD1LNdAsm_F: + case ARM::VLD1LNdAsm_F32: case ARM::VLD1LNdAsm_I32: + case ARM::VLD1LNdAsm_S32: case ARM::VLD1LNdAsm_U32: + return ARM::VLD1LNd32; + + // VLD2LN + case ARM::VLD2LNdWB_fixed_Asm_8: case ARM::VLD2LNdWB_fixed_Asm_P8: + case ARM::VLD2LNdWB_fixed_Asm_I8: case ARM::VLD2LNdWB_fixed_Asm_S8: + case ARM::VLD2LNdWB_fixed_Asm_U8: + return ARM::VLD2LNd8_UPD; + case ARM::VLD2LNdWB_fixed_Asm_16: case ARM::VLD2LNdWB_fixed_Asm_P16: + case ARM::VLD2LNdWB_fixed_Asm_I16: case ARM::VLD2LNdWB_fixed_Asm_S16: + case ARM::VLD2LNdWB_fixed_Asm_U16: + return ARM::VLD2LNd16_UPD; + case ARM::VLD2LNdWB_fixed_Asm_32: case ARM::VLD2LNdWB_fixed_Asm_F: + case ARM::VLD2LNdWB_fixed_Asm_F32: case ARM::VLD2LNdWB_fixed_Asm_I32: + case ARM::VLD2LNdWB_fixed_Asm_S32: case ARM::VLD2LNdWB_fixed_Asm_U32: + return ARM::VLD2LNd32_UPD; + case ARM::VLD2LNdWB_register_Asm_8: case ARM::VLD2LNdWB_register_Asm_P8: + case ARM::VLD2LNdWB_register_Asm_I8: case ARM::VLD2LNdWB_register_Asm_S8: + case ARM::VLD2LNdWB_register_Asm_U8: + return ARM::VLD2LNd8_UPD; + case ARM::VLD2LNdWB_register_Asm_16: 
case ARM::VLD2LNdWB_register_Asm_P16: + case ARM::VLD2LNdWB_register_Asm_I16: case ARM::VLD2LNdWB_register_Asm_S16: + case ARM::VLD2LNdWB_register_Asm_U16: + return ARM::VLD2LNd16_UPD; + case ARM::VLD2LNdWB_register_Asm_32: case ARM::VLD2LNdWB_register_Asm_F: + case ARM::VLD2LNdWB_register_Asm_F32: case ARM::VLD2LNdWB_register_Asm_I32: + case ARM::VLD2LNdWB_register_Asm_S32: case ARM::VLD2LNdWB_register_Asm_U32: + return ARM::VLD2LNd32_UPD; + case ARM::VLD2LNdAsm_8: case ARM::VLD2LNdAsm_P8: + case ARM::VLD2LNdAsm_I8: case ARM::VLD2LNdAsm_S8: + case ARM::VLD2LNdAsm_U8: + return ARM::VLD2LNd8; + case ARM::VLD2LNdAsm_16: case ARM::VLD2LNdAsm_P16: + case ARM::VLD2LNdAsm_I16: case ARM::VLD2LNdAsm_S16: + case ARM::VLD2LNdAsm_U16: + return ARM::VLD2LNd16; + case ARM::VLD2LNdAsm_32: case ARM::VLD2LNdAsm_F: + case ARM::VLD2LNdAsm_F32: case ARM::VLD2LNdAsm_I32: + case ARM::VLD2LNdAsm_S32: case ARM::VLD2LNdAsm_U32: + return ARM::VLD2LNd32; + } +} + bool ARMAsmParser:: processInstruction(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { switch (Inst.getOpcode()) { - // Handle the MOV complex aliases. + // Handle NEON VST complex aliases. + case ARM::VST1LNdWB_register_Asm_8: case ARM::VST1LNdWB_register_Asm_P8: + case ARM::VST1LNdWB_register_Asm_I8: case ARM::VST1LNdWB_register_Asm_S8: + case ARM::VST1LNdWB_register_Asm_U8: case ARM::VST1LNdWB_register_Asm_16: + case ARM::VST1LNdWB_register_Asm_P16: case ARM::VST1LNdWB_register_Asm_I16: + case ARM::VST1LNdWB_register_Asm_S16: case ARM::VST1LNdWB_register_Asm_U16: + case ARM::VST1LNdWB_register_Asm_32: case ARM::VST1LNdWB_register_Asm_F: + case ARM::VST1LNdWB_register_Asm_F32: case ARM::VST1LNdWB_register_Asm_I32: + case ARM::VST1LNdWB_register_Asm_S32: case ARM::VST1LNdWB_register_Asm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + TmpInst.setOpcode(getRealVSTLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + + case ARM::VST2LNdWB_register_Asm_8: case ARM::VST2LNdWB_register_Asm_P8: + case ARM::VST2LNdWB_register_Asm_I8: case ARM::VST2LNdWB_register_Asm_S8: + case ARM::VST2LNdWB_register_Asm_U8: case ARM::VST2LNdWB_register_Asm_16: + case ARM::VST2LNdWB_register_Asm_P16: case ARM::VST2LNdWB_register_Asm_I16: + case ARM::VST2LNdWB_register_Asm_S16: case ARM::VST2LNdWB_register_Asm_U16: + case ARM::VST2LNdWB_register_Asm_32: case ARM::VST2LNdWB_register_Asm_F: + case ARM::VST2LNdWB_register_Asm_F32: case ARM::VST2LNdWB_register_Asm_I32: + case ARM::VST2LNdWB_register_Asm_S32: case ARM::VST2LNdWB_register_Asm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
+ TmpInst.setOpcode(getRealVSTLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg()+1)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + case ARM::VST1LNdWB_fixed_Asm_8: case ARM::VST1LNdWB_fixed_Asm_P8: + case ARM::VST1LNdWB_fixed_Asm_I8: case ARM::VST1LNdWB_fixed_Asm_S8: + case ARM::VST1LNdWB_fixed_Asm_U8: case ARM::VST1LNdWB_fixed_Asm_16: + case ARM::VST1LNdWB_fixed_Asm_P16: case ARM::VST1LNdWB_fixed_Asm_I16: + case ARM::VST1LNdWB_fixed_Asm_S16: case ARM::VST1LNdWB_fixed_Asm_U16: + case ARM::VST1LNdWB_fixed_Asm_32: case ARM::VST1LNdWB_fixed_Asm_F: + case ARM::VST1LNdWB_fixed_Asm_F32: case ARM::VST1LNdWB_fixed_Asm_I32: + case ARM::VST1LNdWB_fixed_Asm_S32: case ARM::VST1LNdWB_fixed_Asm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + TmpInst.setOpcode(getRealVSTLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VST2LNdWB_fixed_Asm_8: case ARM::VST2LNdWB_fixed_Asm_P8: + case ARM::VST2LNdWB_fixed_Asm_I8: case ARM::VST2LNdWB_fixed_Asm_S8: + case ARM::VST2LNdWB_fixed_Asm_U8: case ARM::VST2LNdWB_fixed_Asm_16: + case ARM::VST2LNdWB_fixed_Asm_P16: case ARM::VST2LNdWB_fixed_Asm_I16: + case ARM::VST2LNdWB_fixed_Asm_S16: case ARM::VST2LNdWB_fixed_Asm_U16: + case ARM::VST2LNdWB_fixed_Asm_32: case ARM::VST2LNdWB_fixed_Asm_F: + case ARM::VST2LNdWB_fixed_Asm_F32: case ARM::VST2LNdWB_fixed_Asm_I32: + case ARM::VST2LNdWB_fixed_Asm_S32: case ARM::VST2LNdWB_fixed_Asm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + TmpInst.setOpcode(getRealVSTLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg()+1)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + case ARM::VST1LNdAsm_8: case ARM::VST1LNdAsm_P8: case ARM::VST1LNdAsm_I8: + case ARM::VST1LNdAsm_S8: case ARM::VST1LNdAsm_U8: case ARM::VST1LNdAsm_16: + case ARM::VST1LNdAsm_P16: case ARM::VST1LNdAsm_I16: case ARM::VST1LNdAsm_S16: + case ARM::VST1LNdAsm_U16: case ARM::VST1LNdAsm_32: case ARM::VST1LNdAsm_F: + case ARM::VST1LNdAsm_F32: case ARM::VST1LNdAsm_I32: case ARM::VST1LNdAsm_S32: + case ARM::VST1LNdAsm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
+ TmpInst.setOpcode(getRealVSTLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VST2LNdAsm_8: case ARM::VST2LNdAsm_P8: case ARM::VST2LNdAsm_I8: + case ARM::VST2LNdAsm_S8: case ARM::VST2LNdAsm_U8: case ARM::VST2LNdAsm_16: + case ARM::VST2LNdAsm_P16: case ARM::VST2LNdAsm_I16: case ARM::VST2LNdAsm_S16: + case ARM::VST2LNdAsm_U16: case ARM::VST2LNdAsm_32: case ARM::VST2LNdAsm_F: + case ARM::VST2LNdAsm_F32: case ARM::VST2LNdAsm_I32: case ARM::VST2LNdAsm_S32: + case ARM::VST2LNdAsm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + TmpInst.setOpcode(getRealVSTLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg()+1)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + // Handle NEON VLD complex aliases. + case ARM::VLD1LNdWB_register_Asm_8: case ARM::VLD1LNdWB_register_Asm_P8: + case ARM::VLD1LNdWB_register_Asm_I8: case ARM::VLD1LNdWB_register_Asm_S8: + case ARM::VLD1LNdWB_register_Asm_U8: case ARM::VLD1LNdWB_register_Asm_16: + case ARM::VLD1LNdWB_register_Asm_P16: case ARM::VLD1LNdWB_register_Asm_I16: + case ARM::VLD1LNdWB_register_Asm_S16: case ARM::VLD1LNdWB_register_Asm_U16: + case ARM::VLD1LNdWB_register_Asm_32: case ARM::VLD1LNdWB_register_Asm_F: + case ARM::VLD1LNdWB_register_Asm_F32: case ARM::VLD1LNdWB_register_Asm_I32: + case ARM::VLD1LNdWB_register_Asm_S32: case ARM::VLD1LNdWB_register_Asm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + TmpInst.setOpcode(getRealVLDLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + + case ARM::VLD2LNdWB_register_Asm_8: case ARM::VLD2LNdWB_register_Asm_P8: + case ARM::VLD2LNdWB_register_Asm_I8: case ARM::VLD2LNdWB_register_Asm_S8: + case ARM::VLD2LNdWB_register_Asm_U8: case ARM::VLD2LNdWB_register_Asm_16: + case ARM::VLD2LNdWB_register_Asm_P16: case ARM::VLD2LNdWB_register_Asm_I16: + case ARM::VLD2LNdWB_register_Asm_S16: case ARM::VLD2LNdWB_register_Asm_U16: + case ARM::VLD2LNdWB_register_Asm_32: case ARM::VLD2LNdWB_register_Asm_F: + case ARM::VLD2LNdWB_register_Asm_F32: case ARM::VLD2LNdWB_register_Asm_I32: + case ARM::VLD2LNdWB_register_Asm_S32: case ARM::VLD2LNdWB_register_Asm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
+ TmpInst.setOpcode(getRealVLDLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg()+1)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg()+1)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + + case ARM::VLD1LNdWB_fixed_Asm_8: case ARM::VLD1LNdWB_fixed_Asm_P8: + case ARM::VLD1LNdWB_fixed_Asm_I8: case ARM::VLD1LNdWB_fixed_Asm_S8: + case ARM::VLD1LNdWB_fixed_Asm_U8: case ARM::VLD1LNdWB_fixed_Asm_16: + case ARM::VLD1LNdWB_fixed_Asm_P16: case ARM::VLD1LNdWB_fixed_Asm_I16: + case ARM::VLD1LNdWB_fixed_Asm_S16: case ARM::VLD1LNdWB_fixed_Asm_U16: + case ARM::VLD1LNdWB_fixed_Asm_32: case ARM::VLD1LNdWB_fixed_Asm_F: + case ARM::VLD1LNdWB_fixed_Asm_F32: case ARM::VLD1LNdWB_fixed_Asm_I32: + case ARM::VLD1LNdWB_fixed_Asm_S32: case ARM::VLD1LNdWB_fixed_Asm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + TmpInst.setOpcode(getRealVLDLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VLD2LNdWB_fixed_Asm_8: case ARM::VLD2LNdWB_fixed_Asm_P8: + case ARM::VLD2LNdWB_fixed_Asm_I8: case ARM::VLD2LNdWB_fixed_Asm_S8: + case ARM::VLD2LNdWB_fixed_Asm_U8: case ARM::VLD2LNdWB_fixed_Asm_16: + case ARM::VLD2LNdWB_fixed_Asm_P16: case ARM::VLD2LNdWB_fixed_Asm_I16: + case ARM::VLD2LNdWB_fixed_Asm_S16: case ARM::VLD2LNdWB_fixed_Asm_U16: + case ARM::VLD2LNdWB_fixed_Asm_32: case ARM::VLD2LNdWB_fixed_Asm_F: + case ARM::VLD2LNdWB_fixed_Asm_F32: case ARM::VLD2LNdWB_fixed_Asm_I32: + case ARM::VLD2LNdWB_fixed_Asm_S32: case ARM::VLD2LNdWB_fixed_Asm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
+ TmpInst.setOpcode(getRealVLDLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg()+1)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg()+1)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VLD1LNdAsm_8: case ARM::VLD1LNdAsm_P8: case ARM::VLD1LNdAsm_I8: + case ARM::VLD1LNdAsm_S8: case ARM::VLD1LNdAsm_U8: case ARM::VLD1LNdAsm_16: + case ARM::VLD1LNdAsm_P16: case ARM::VLD1LNdAsm_I16: case ARM::VLD1LNdAsm_S16: + case ARM::VLD1LNdAsm_U16: case ARM::VLD1LNdAsm_32: case ARM::VLD1LNdAsm_F: + case ARM::VLD1LNdAsm_F32: case ARM::VLD1LNdAsm_I32: case ARM::VLD1LNdAsm_S32: + case ARM::VLD1LNdAsm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + TmpInst.setOpcode(getRealVLDLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VLD2LNdAsm_8: case ARM::VLD2LNdAsm_P8: case ARM::VLD2LNdAsm_I8: + case ARM::VLD2LNdAsm_S8: case ARM::VLD2LNdAsm_U8: case ARM::VLD2LNdAsm_16: + case ARM::VLD2LNdAsm_P16: case ARM::VLD2LNdAsm_I16: case ARM::VLD2LNdAsm_S16: + case ARM::VLD2LNdAsm_U16: case ARM::VLD2LNdAsm_32: case ARM::VLD2LNdAsm_F: + case ARM::VLD2LNdAsm_F32: case ARM::VLD2LNdAsm_I32: case ARM::VLD2LNdAsm_S32: + case ARM::VLD2LNdAsm_U32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + TmpInst.setOpcode(getRealVLDLNOpcode(Inst.getOpcode())); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg()+1)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg()+1)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + // Handle the Thumb2 mode MOV complex aliases. + case ARM::t2MOVsi: + case ARM::t2MOVSsi: { + // Which instruction to expand to depends on the CCOut operand and + // whether we're in an IT block if the register operands are low + // registers. + bool isNarrow = false; + if (isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg()) && + inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi)) + isNarrow = true; + MCInst TmpInst; + unsigned newOpc; + switch(ARM_AM::getSORegShOp(Inst.getOperand(2).getImm())) { + default: llvm_unreachable("unexpected opcode!"); + case ARM_AM::asr: newOpc = isNarrow ? 
ARM::tASRri : ARM::t2ASRri; break; + case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break; + case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break; + case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break; + } + unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()); + if (Amount == 32) Amount = 0; + TmpInst.setOpcode(newOpc); + TmpInst.addOperand(Inst.getOperand(0)); // Rd + if (isNarrow) + TmpInst.addOperand(MCOperand::CreateReg( + Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(MCOperand::CreateImm(Amount)); + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + if (!isNarrow) + TmpInst.addOperand(MCOperand::CreateReg( + Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0)); + Inst = TmpInst; + return true; + } + // Handle the ARM mode MOV complex aliases. case ARM::ASRr: case ARM::LSRr: case ARM::LSLr: @@ -4743,6 +5656,24 @@ processInstruction(MCInst &Inst, Inst = TmpInst; } break; + case ARM::t2ADDri12: + // If the immediate fits for encoding T3 (t2ADDri) and the generic "add" + // mnemonic was used (not "addw"), encoding T3 is preferred. + if (static_cast<ARMOperand*>(Operands[0])->getToken() != "add" || + ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1) + break; + Inst.setOpcode(ARM::t2ADDri); + Inst.addOperand(MCOperand::CreateReg(0)); // cc_out + break; + case ARM::t2SUBri12: + // If the immediate fits for encoding T3 (t2SUBri) and the generic "sub" + // mnemonic was used (not "subw"), encoding T3 is preferred. + if (static_cast<ARMOperand*>(Operands[0])->getToken() != "sub" || + ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1) + break; + Inst.setOpcode(ARM::t2SUBri); + Inst.addOperand(MCOperand::CreateReg(0)); // cc_out + break; case ARM::tADDi8: // If the immediate is in the range 0-7, we want tADDi3 iff Rd was // explicitly specified. From the ARM ARM: "Encoding T1 is preferred @@ -4763,6 +5694,26 @@ processInstruction(MCInst &Inst, return true; } break; + case ARM::t2ADDrr: { + // If the destination and first source operand are the same, and + // there's no setting of the flags, use encoding T2 instead of T3. + // Note that this is only for ADD, not SUB. This mirrors the system + // 'as' behaviour. Make sure the wide encoding wasn't explicit. + if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() || + Inst.getOperand(5).getReg() != 0 || + (static_cast<ARMOperand*>(Operands[3])->isToken() && + static_cast<ARMOperand*>(Operands[3])->getToken() == ".w")) + break; + MCInst TmpInst; + TmpInst.setOpcode(ARM::tADDhirr); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } case ARM::tB: // A Thumb conditional branch outside of an IT block is a tBcc.
if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()) { @@ -5079,12 +6030,16 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveWord(4, DirectiveID.getLoc()); else if (IDVal == ".thumb") return parseDirectiveThumb(DirectiveID.getLoc()); + else if (IDVal == ".arm") + return parseDirectiveARM(DirectiveID.getLoc()); else if (IDVal == ".thumb_func") return parseDirectiveThumbFunc(DirectiveID.getLoc()); else if (IDVal == ".code") return parseDirectiveCode(DirectiveID.getLoc()); else if (IDVal == ".syntax") return parseDirectiveSyntax(DirectiveID.getLoc()); + else if (IDVal == ".unreq") + return parseDirectiveUnreq(DirectiveID.getLoc()); return true; } @@ -5120,9 +6075,22 @@ bool ARMAsmParser::parseDirectiveThumb(SMLoc L) { return Error(L, "unexpected token in directive"); Parser.Lex(); - // TODO: set thumb mode - // TODO: tell the MC streamer the mode - // getParser().getStreamer().Emit???(); + if (!isThumb()) + SwitchMode(); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16); + return false; +} + +/// parseDirectiveARM +/// ::= .arm +bool ARMAsmParser::parseDirectiveARM(SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + + if (isThumb()) + SwitchMode(); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32); return false; } @@ -5212,6 +6180,45 @@ bool ARMAsmParser::parseDirectiveCode(SMLoc L) { return false; } +/// parseDirectiveReq +/// ::= name .req registername +bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { + Parser.Lex(); // Eat the '.req' token. + unsigned Reg; + SMLoc SRegLoc, ERegLoc; + if (ParseRegister(Reg, SRegLoc, ERegLoc)) { + Parser.EatToEndOfStatement(); + return Error(SRegLoc, "register name expected"); + } + + // Shouldn't be anything else. + if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { + Parser.EatToEndOfStatement(); + return Error(Parser.getTok().getLoc(), + "unexpected input in .req directive."); + } + + Parser.Lex(); // Consume the EndOfStatement + + if (RegisterReqs.GetOrCreateValue(Name, Reg).getValue() != Reg) + return Error(SRegLoc, "redefinition of '" + Name + + "' does not match original."); + + return false; +} + +/// parseDirectiveUnreq +/// ::= .unreq registername +bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) { + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Parser.EatToEndOfStatement(); + return Error(L, "unexpected input in .unreq directive."); + } + RegisterReqs.erase(Parser.getTok().getIdentifier()); + Parser.Lex(); // Eat the identifier. + return false; +} + extern "C" void LLVMInitializeARMAsmLexer(); /// Force static initialization.
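The `.req`/`.unreq` support added above amounts to simple name-to-register bookkeeping: `parseDirectiveReq` only rejects rebinding an existing alias to a *different* register, and `parseDirectiveUnreq` erases the entry unconditionally. Below is a minimal standalone sketch of that behaviour, using a plain `std::map` in place of the parser's `RegisterReqs` string map; the class and function names are illustrative only, not LLVM API.

```cpp
#include <cassert>
#include <map>
#include <string>

// Hypothetical illustration of the alias table implied by the patch: ".req"
// keeps the first binding of a name and only reports an error when the name
// is rebound to a different register; ".unreq" drops the binding.
class RegAliasTable {
  std::map<std::string, unsigned> Aliases;

public:
  // "Name .req Reg" -- returns false (an error) if Name is already bound to
  // another register, mirroring the GetOrCreateValue check in the diff.
  bool defineReq(const std::string &Name, unsigned Reg) {
    auto It = Aliases.emplace(Name, Reg).first; // keeps any existing binding
    return It->second == Reg;
  }

  // ".unreq Name" -- erasing an unknown alias is a silent no-op.
  void removeUnreq(const std::string &Name) { Aliases.erase(Name); }
};

int main() {
  RegAliasTable T;
  assert(T.defineReq("foo", 5));  // foo .req r5
  assert(T.defineReq("foo", 5));  // redefining to the same register is fine
  assert(!T.defineReq("foo", 6)); // conflicting redefinition is rejected
  T.removeUnreq("foo");           // .unreq foo
  assert(T.defineReq("foo", 6));  // the name is free to be reused
  return 0;
}
```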
diff --git a/lib/Target/ARM/AsmParser/CMakeLists.txt b/lib/Target/ARM/AsmParser/CMakeLists.txt index 3f5ad39..e24a1b1 100644 --- a/lib/Target/ARM/AsmParser/CMakeLists.txt +++ b/lib/Target/ARM/AsmParser/CMakeLists.txt @@ -6,11 +6,3 @@ add_llvm_library(LLVMARMAsmParser ) add_dependencies(LLVMARMAsmParser ARMCommonTableGen) - -add_llvm_library_dependencies(LLVMARMAsmParser - LLVMARMDesc - LLVMARMInfo - LLVMMC - LLVMMCParser - LLVMSupport - ) diff --git a/lib/Target/ARM/AsmParser/LLVMBuild.txt b/lib/Target/ARM/AsmParser/LLVMBuild.txt index cbf9b4b..f0184b6 100644 --- a/lib/Target/ARM/AsmParser/LLVMBuild.txt +++ b/lib/Target/ARM/AsmParser/LLVMBuild.txt @@ -21,4 +21,3 @@ name = ARMAsmParser parent = ARM required_libraries = ARMDesc ARMInfo MC MCParser Support add_to_library_groups = ARM - diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 511932e..04cdf55 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -48,20 +48,6 @@ add_llvm_target(ARMCodeGen Thumb2SizeReduction.cpp ) -add_llvm_library_dependencies(LLVMARMCodeGen - LLVMARMAsmPrinter - LLVMARMDesc - LLVMARMInfo - LLVMAnalysis - LLVMAsmPrinter - LLVMCodeGen - LLVMCore - LLVMMC - LLVMSelectionDAG - LLVMSupport - LLVMTarget - ) - # workaround for hanging compilation on MSVC9, 10 if( MSVC_VERSION EQUAL 1600 OR MSVC_VERSION EQUAL 1500 ) set_property( diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index ad250ab..49c64fd 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -2085,15 +2085,24 @@ static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Insn, case ARM::VLD1d32Qwb_register: case ARM::VLD1d64Qwb_fixed: case ARM::VLD1d64Qwb_register: - case ARM::VLD2d8_UPD: - case ARM::VLD2d16_UPD: - case ARM::VLD2d32_UPD: - case ARM::VLD2q8_UPD: - case ARM::VLD2q16_UPD: - case ARM::VLD2q32_UPD: - case ARM::VLD2b8_UPD: - case ARM::VLD2b16_UPD: - case ARM::VLD2b32_UPD: + case ARM::VLD2d8wb_fixed: + case ARM::VLD2d16wb_fixed: + case ARM::VLD2d32wb_fixed: + case ARM::VLD2q8wb_fixed: + case ARM::VLD2q16wb_fixed: + case ARM::VLD2q32wb_fixed: + case ARM::VLD2d8wb_register: + case ARM::VLD2d16wb_register: + case ARM::VLD2d32wb_register: + case ARM::VLD2q8wb_register: + case ARM::VLD2q16wb_register: + case ARM::VLD2q32wb_register: + case ARM::VLD2b8wb_fixed: + case ARM::VLD2b16wb_fixed: + case ARM::VLD2b32wb_fixed: + case ARM::VLD2b8wb_register: + case ARM::VLD2b16wb_register: + case ARM::VLD2b32wb_register: case ARM::VLD3d8_UPD: case ARM::VLD3d16_UPD: case ARM::VLD3d32_UPD: @@ -2196,23 +2205,40 @@ static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn, case ARM::VST1q16wb_register: case ARM::VST1q32wb_register: case ARM::VST1q64wb_register: - case ARM::VST1d8T_UPD: - case ARM::VST1d16T_UPD: - case ARM::VST1d32T_UPD: - case ARM::VST1d64T_UPD: - case ARM::VST1d8Q_UPD: - case ARM::VST1d16Q_UPD: - case ARM::VST1d32Q_UPD: - case ARM::VST1d64Q_UPD: - case ARM::VST2d8_UPD: - case ARM::VST2d16_UPD: - case ARM::VST2d32_UPD: - case ARM::VST2q8_UPD: - case ARM::VST2q16_UPD: - case ARM::VST2q32_UPD: - case ARM::VST2b8_UPD: - case ARM::VST2b16_UPD: - case ARM::VST2b32_UPD: + case ARM::VST1d8Twb_fixed: + case ARM::VST1d16Twb_fixed: + case ARM::VST1d32Twb_fixed: + case ARM::VST1d64Twb_fixed: + case ARM::VST1d8Twb_register: + case ARM::VST1d16Twb_register: + case ARM::VST1d32Twb_register: + case ARM::VST1d64Twb_register: + case ARM::VST1d8Qwb_fixed: + case 
ARM::VST1d16Qwb_fixed: + case ARM::VST1d32Qwb_fixed: + case ARM::VST1d64Qwb_fixed: + case ARM::VST1d8Qwb_register: + case ARM::VST1d16Qwb_register: + case ARM::VST1d32Qwb_register: + case ARM::VST1d64Qwb_register: + case ARM::VST2d8wb_fixed: + case ARM::VST2d16wb_fixed: + case ARM::VST2d32wb_fixed: + case ARM::VST2d8wb_register: + case ARM::VST2d16wb_register: + case ARM::VST2d32wb_register: + case ARM::VST2q8wb_fixed: + case ARM::VST2q16wb_fixed: + case ARM::VST2q32wb_fixed: + case ARM::VST2q8wb_register: + case ARM::VST2q16wb_register: + case ARM::VST2q32wb_register: + case ARM::VST2b8wb_fixed: + case ARM::VST2b16wb_fixed: + case ARM::VST2b32wb_fixed: + case ARM::VST2b8wb_register: + case ARM::VST2b16wb_register: + case ARM::VST2b32wb_register: case ARM::VST3d8_UPD: case ARM::VST3d16_UPD: case ARM::VST3d32_UPD: @@ -2264,34 +2290,6 @@ static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn, // Second input register switch (Inst.getOpcode()) { - case ARM::VST1d8T: - case ARM::VST1d16T: - case ARM::VST1d32T: - case ARM::VST1d64T: - case ARM::VST1d8T_UPD: - case ARM::VST1d16T_UPD: - case ARM::VST1d32T_UPD: - case ARM::VST1d64T_UPD: - case ARM::VST1d8Q: - case ARM::VST1d16Q: - case ARM::VST1d32Q: - case ARM::VST1d64Q: - case ARM::VST1d8Q_UPD: - case ARM::VST1d16Q_UPD: - case ARM::VST1d32Q_UPD: - case ARM::VST1d64Q_UPD: - case ARM::VST2d8: - case ARM::VST2d16: - case ARM::VST2d32: - case ARM::VST2d8_UPD: - case ARM::VST2d16_UPD: - case ARM::VST2d32_UPD: - case ARM::VST2q8: - case ARM::VST2q16: - case ARM::VST2q32: - case ARM::VST2q8_UPD: - case ARM::VST2q16_UPD: - case ARM::VST2q32_UPD: case ARM::VST3d8: case ARM::VST3d16: case ARM::VST3d32: @@ -2307,12 +2305,6 @@ static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn, if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder))) return MCDisassembler::Fail; break; - case ARM::VST2b8: - case ARM::VST2b16: - case ARM::VST2b32: - case ARM::VST2b8_UPD: - case ARM::VST2b16_UPD: - case ARM::VST2b32_UPD: case ARM::VST3q8: case ARM::VST3q16: case ARM::VST3q32: @@ -2334,28 +2326,6 @@ static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn, // Third input register switch (Inst.getOpcode()) { - case ARM::VST1d8T: - case ARM::VST1d16T: - case ARM::VST1d32T: - case ARM::VST1d64T: - case ARM::VST1d8T_UPD: - case ARM::VST1d16T_UPD: - case ARM::VST1d32T_UPD: - case ARM::VST1d64T_UPD: - case ARM::VST1d8Q: - case ARM::VST1d16Q: - case ARM::VST1d32Q: - case ARM::VST1d64Q: - case ARM::VST1d8Q_UPD: - case ARM::VST1d16Q_UPD: - case ARM::VST1d32Q_UPD: - case ARM::VST1d64Q_UPD: - case ARM::VST2q8: - case ARM::VST2q16: - case ARM::VST2q32: - case ARM::VST2q8_UPD: - case ARM::VST2q16_UPD: - case ARM::VST2q32_UPD: case ARM::VST3d8: case ARM::VST3d16: case ARM::VST3d32: @@ -2392,20 +2362,6 @@ static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn, // Fourth input register switch (Inst.getOpcode()) { - case ARM::VST1d8Q: - case ARM::VST1d16Q: - case ARM::VST1d32Q: - case ARM::VST1d64Q: - case ARM::VST1d8Q_UPD: - case ARM::VST1d16Q_UPD: - case ARM::VST1d32Q_UPD: - case ARM::VST1d64Q_UPD: - case ARM::VST2q8: - case ARM::VST2q16: - case ARM::VST2q32: - case ARM::VST2q8_UPD: - case ARM::VST2q16_UPD: - case ARM::VST2q32_UPD: case ARM::VST4d8: case ARM::VST4d16: case ARM::VST4d32: @@ -2441,16 +2397,11 @@ static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Insn, unsigned Rm = fieldFromInstruction32(Insn, 0, 4); unsigned align = fieldFromInstruction32(Insn, 4, 1); 
unsigned size = fieldFromInstruction32(Insn, 6, 2); - unsigned regs = fieldFromInstruction32(Insn, 5, 1) + 1; align *= (1 << size); if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) return MCDisassembler::Fail; - if (regs == 2) { - if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder))) - return MCDisassembler::Fail; - } if (Rm != 0xF) { if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) return MCDisassembler::Fail; @@ -2460,12 +2411,12 @@ static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; Inst.addOperand(MCOperand::CreateImm(align)); - if (Rm == 0xD) - Inst.addOperand(MCOperand::CreateReg(0)); - else if (Rm != 0xF) { - if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) - return MCDisassembler::Fail; - } + // The fixed offset post-increment encodes Rm == 0xd. The no-writeback + // variant encodes Rm == 0xf. Anything else is a register offset post- + // increment and we need to add the register operand to the instruction. + if (Rm != 0xD && Rm != 0xF && + !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; return S; } @@ -2693,7 +2644,6 @@ static DecodeStatus DecodeTBLInstruction(llvm::MCInst &Inst, unsigned Insn, unsigned Rm = fieldFromInstruction32(Insn, 0, 4); Rm |= fieldFromInstruction32(Insn, 5, 1) << 4; unsigned op = fieldFromInstruction32(Insn, 6, 1); - unsigned length = fieldFromInstruction32(Insn, 8, 2) + 1; if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) return MCDisassembler::Fail; @@ -2702,10 +2652,8 @@ static DecodeStatus DecodeTBLInstruction(llvm::MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; // Writeback } - for (unsigned i = 0; i < length; ++i) { - if (!Check(S, DecodeDPRRegisterClass(Inst, (Rn+i)%32, Address, Decoder))) + if (!Check(S, DecodeDPRRegisterClass(Inst, Rn, Address, Decoder))) return MCDisassembler::Fail; - } if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; @@ -4138,4 +4086,3 @@ static DecodeStatus DecodeVCVTQ(llvm::MCInst &Inst, unsigned Insn, return S; } - diff --git a/lib/Target/ARM/Disassembler/CMakeLists.txt b/lib/Target/ARM/Disassembler/CMakeLists.txt index da87751..9de6e5c 100644 --- a/lib/Target/ARM/Disassembler/CMakeLists.txt +++ b/lib/Target/ARM/Disassembler/CMakeLists.txt @@ -11,11 +11,3 @@ set_property( ) endif() add_dependencies(LLVMARMDisassembler ARMCommonTableGen) - -add_llvm_library_dependencies(LLVMARMDisassembler - LLVMARMCodeGen - LLVMARMDesc - LLVMARMInfo - LLVMMC - LLVMSupport - ) diff --git a/lib/Target/ARM/Disassembler/LLVMBuild.txt b/lib/Target/ARM/Disassembler/LLVMBuild.txt index baa9bc3..94075a9 100644 --- a/lib/Target/ARM/Disassembler/LLVMBuild.txt +++ b/lib/Target/ARM/Disassembler/LLVMBuild.txt @@ -21,4 +21,3 @@ name = ARMDisassembler parent = ARM required_libraries = ARMCodeGen ARMDesc ARMInfo MC Support add_to_library_groups = ARM - diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 6c6c021..662097a 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -1029,3 +1029,29 @@ void ARMInstPrinter::printVectorListFour(const MCInst *MI, unsigned OpNum, << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << ", " << getRegisterName(MI->getOperand(OpNum).getReg() + 3) << "}"; } + +void ARMInstPrinter::printVectorListOneAllLanes(const MCInst *MI, + unsigned OpNum, + raw_ostream 
&O) { + O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "[]}"; +} + +void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D<n>. + O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "[], " + << getRegisterName(MI->getOperand(OpNum).getReg() + 1) << "[]}"; +} + +void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D<n>. + O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << ", " + << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "}"; +} + diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h index 3f38f1a..05db2d2 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -133,6 +133,12 @@ public: void printVectorListTwo(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVectorListThree(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVectorListFour(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printVectorListOneAllLanes(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printVectorListTwoAllLanes(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum, + raw_ostream &O); }; } // end namespace llvm diff --git a/lib/Target/ARM/InstPrinter/CMakeLists.txt b/lib/Target/ARM/InstPrinter/CMakeLists.txt index fa0b495..e2d4819 100644 --- a/lib/Target/ARM/InstPrinter/CMakeLists.txt +++ b/lib/Target/ARM/InstPrinter/CMakeLists.txt @@ -5,8 +5,3 @@ add_llvm_library(LLVMARMAsmPrinter ) add_dependencies(LLVMARMAsmPrinter ARMCommonTableGen) - -add_llvm_library_dependencies(LLVMARMAsmPrinter - LLVMMC - LLVMSupport - ) diff --git a/lib/Target/ARM/InstPrinter/LLVMBuild.txt b/lib/Target/ARM/InstPrinter/LLVMBuild.txt index b34aab4..6f4fa36 100644 --- a/lib/Target/ARM/InstPrinter/LLVMBuild.txt +++ b/lib/Target/ARM/InstPrinter/LLVMBuild.txt @@ -21,4 +21,3 @@ name = ARMAsmPrinter parent = ARM required_libraries = MC Support add_to_library_groups = ARM - diff --git a/lib/Target/ARM/LLVMBuild.txt b/lib/Target/ARM/LLVMBuild.txt index 9082539..fd4b3a3 100644 --- a/lib/Target/ARM/LLVMBuild.txt +++ b/lib/Target/ARM/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo + [component_0] type = TargetGroup name = ARM @@ -30,4 +33,3 @@ name = ARMCodeGen parent = ARM required_libraries = ARMAsmPrinter ARMDesc ARMInfo Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target add_to_library_groups = ARM - diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 62d04c4..bf1f0e8 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -102,6 +102,11 @@ public: bool MayNeedRelaxation(const MCInst &Inst) const; + bool fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) 
const; + void RelaxInstruction(const MCInst &Inst, MCInst &Res) const; bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const; @@ -124,14 +129,49 @@ public: }; } // end anonymous namespace +static unsigned getRelaxedOpcode(unsigned Op) { + switch (Op) { + default: return Op; + case ARM::tBcc: return ARM::t2Bcc; + } +} + bool ARMAsmBackend::MayNeedRelaxation(const MCInst &Inst) const { - // FIXME: Thumb targets, different move constant targets.. + if (getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode()) + return true; return false; } +bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const { + // Relaxing tBcc to t2Bcc. tBcc has a signed 9-bit displacement with the + // low bit being an implied zero. There's an implied +4 offset for the + // branch, so we adjust the other way here to determine what's + // encodable. + // + // Relax if the value is too big for a (signed) i8. + int64_t Offset = int64_t(Value) - 4; + return Offset > 254 || Offset < -256; +} + void ARMAsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const { - assert(0 && "ARMAsmBackend::RelaxInstruction() unimplemented"); - return; + unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode()); + + // Sanity check w/ diagnostic if we get here w/ a bogus instruction. + if (RelaxedOp == Inst.getOpcode()) { + SmallString<256> Tmp; + raw_svector_ostream OS(Tmp); + Inst.dump_pretty(OS); + OS << "\n"; + report_fatal_error("unexpected instruction to relax: " + OS.str()); + } + + // The instructions we're relaxing have (so far) the same operands. + // We just need to update to the proper opcode. + Res = Inst; + Res.setOpcode(RelaxedOp); } bool ARMAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 865c3e2..c38a882 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -1412,7 +1412,7 @@ getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const { const MCOperand &MO = MI.getOperand(Op); if (MO.getReg() == 0) return 0x0D; - return MO.getReg(); + return getARMRegisterNumbering(MO.getReg()); } unsigned ARMMCCodeEmitter:: diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 352c73e..f394b4f 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCMachOSymbolFlags.h" #include "llvm/MC/MCValue.h" #include "llvm/Object/MachOFormat.h" #include "llvm/Support/ErrorHandling.h" @@ -178,9 +179,16 @@ RecordARMMovwMovtRelocation(MachObjectWriter *Writer, case ARM::fixup_arm_movt_hi16: case ARM::fixup_arm_movt_hi16_pcrel: MovtBit = 1; + // The thumb bit shouldn't be set in the 'other-half' bit of the + // relocation, but it will be set in FixedValue if the base symbol + // is a thumb function. Clear it out here. 
+ if (A_SD->getFlags() & SF_ThumbFunc) + FixedValue &= 0xfffffffe; break; case ARM::fixup_t2_movt_hi16: case ARM::fixup_t2_movt_hi16_pcrel: + if (A_SD->getFlags() & SF_ThumbFunc) + FixedValue &= 0xfffffffe; MovtBit = 1; // Fallthrough case ARM::fixup_t2_movw_lo16: @@ -189,7 +197,6 @@ RecordARMMovwMovtRelocation(MachObjectWriter *Writer, break; } - if (Type == macho::RIT_ARM_HalfDifference) { uint32_t OtherHalf = MovtBit ? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16); diff --git a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt index f529314..f2cf78a 100644 --- a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt @@ -10,10 +10,3 @@ add_dependencies(LLVMARMDesc ARMCommonTableGen) # Hack: we need to include 'main' target directory to grab private headers include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) - -add_llvm_library_dependencies(LLVMARMDesc - LLVMARMAsmPrinter - LLVMARMInfo - LLVMMC - LLVMSupport - ) diff --git a/lib/Target/ARM/MCTargetDesc/LLVMBuild.txt b/lib/Target/ARM/MCTargetDesc/LLVMBuild.txt index 46b11c7..2a7fe61 100644 --- a/lib/Target/ARM/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/ARM/MCTargetDesc/LLVMBuild.txt @@ -21,4 +21,3 @@ name = ARMDesc parent = ARM required_libraries = ARMAsmPrinter ARMInfo MC Support add_to_library_groups = ARM - diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 2df0053..000a37f 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -139,7 +139,7 @@ bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const { // FIXME: Detect integer instructions properly. const MCInstrDesc &MCID = MI->getDesc(); unsigned Domain = MCID.TSFlags & ARMII::DomainMask; - if (MCID.mayStore()) + if (MI->mayStore()) return false; unsigned Opcode = MCID.getOpcode(); if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) @@ -222,14 +222,14 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, const MCInstrDesc &MCID2 = TII->get(AddSubOpc); unsigned TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI)); - MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), MCID1, TmpReg) + MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) .addReg(Src1Reg, getKillRegState(Src1Kill)) .addReg(Src2Reg, getKillRegState(Src2Kill)); if (HasLane) MIB.addImm(LaneImm); MIB.addImm(Pred).addReg(PredReg); - MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), MCID2) + MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID2) .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead)); if (NegAcc) { @@ -274,7 +274,7 @@ bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { } const MCInstrDesc &MCID = MI->getDesc(); - if (MCID.isBarrier()) { + if (MI->isBarrier()) { clearStack(); Skip = 0; ++MII; diff --git a/lib/Target/ARM/TargetInfo/CMakeLists.txt b/lib/Target/ARM/TargetInfo/CMakeLists.txt index 8b38b13..533e747 100644 --- a/lib/Target/ARM/TargetInfo/CMakeLists.txt +++ b/lib/Target/ARM/TargetInfo/CMakeLists.txt @@ -5,9 +5,3 @@ add_llvm_library(LLVMARMInfo ) add_dependencies(LLVMARMInfo ARMCommonTableGen) - -add_llvm_library_dependencies(LLVMARMInfo - LLVMMC - LLVMSupport - LLVMTarget - ) diff --git a/lib/Target/ARM/TargetInfo/LLVMBuild.txt b/lib/Target/ARM/TargetInfo/LLVMBuild.txt index 046c1fc..a07a940 100644 --- a/lib/Target/ARM/TargetInfo/LLVMBuild.txt +++ b/lib/Target/ARM/TargetInfo/LLVMBuild.txt @@ -21,4 
+21,3 @@ name = ARMInfo parent = ARM required_libraries = MC Support Target add_to_library_groups = ARM - diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index e8ed482..e61c0a7 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -643,14 +643,13 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(Offset && "This code isn't needed if offset already handled!"); unsigned Opcode = MI.getOpcode(); - const MCInstrDesc &Desc = MI.getDesc(); // Remove predicate first. int PIdx = MI.findFirstPredOperandIdx(); if (PIdx != -1) removeOperands(MI, PIdx); - if (Desc.mayLoad()) { + if (MI.mayLoad()) { // Use the destination register to materialize sp + offset. unsigned TmpReg = MI.getOperand(0).getReg(); bool UseRR = false; @@ -673,7 +672,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame // register. The offset is already handled in the vreg value. MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false); - } else if (Desc.mayStore()) { + } else if (MI.mayStore()) { VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); bool UseRR = false; @@ -699,7 +698,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } // Add predicate back if it's needed. - if (MI.getDesc().isPredicable()) { + if (MI.isPredicable()) { MachineInstrBuilder MIB(&MI); AddDefaultPred(MIB); } diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index b627400..55b4d30 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -13,6 +13,7 @@ #include "Thumb2InstrInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" @@ -141,7 +142,7 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI, // rsb r2, 0 // const MCInstrDesc &MCID = MI->getDesc(); - if (MCID.hasOptionalDef() && + if (MI->hasOptionalDef() && MI->getOperand(MCID.getNumOperands() - 1).getReg() == ARM::CPSR) return false; @@ -198,7 +199,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { // Branches, including tricky ones like LDM_RET, need to end an IT // block so check the instruction we just put in the block. for (; MBBI != E && Pos && - (!MI->getDesc().isBranch() && !MI->getDesc().isReturn()) ; ++MBBI) { + (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) { if (MBBI->isDebugValue()) continue; @@ -237,6 +238,9 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { // Last instruction in IT block kills ITSTATE. LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill(); + // Finalize the bundle. + FinalizeBundle(MBB, InsertPos.getInstrIterator(), LastITMI); + Modified = true; ++NumITs; } diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index e5fc8b4..e206288 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -452,7 +452,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // Add the 16-bit load / store instruction. 
DebugLoc dl = MI->getDebugLoc(); - MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, TII->get(Opc)); + MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc)); if (!isLdStMul) { MIB.addOperand(MI->getOperand(0)); MIB.addOperand(MI->getOperand(1)); @@ -478,7 +478,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); - MBB.erase(MI); + MBB.erase_instr(MI); ++NumLdSts; return true; } @@ -513,7 +513,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR) return false; - MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), + MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(ARM::tADDrSPi)) .addOperand(MI->getOperand(0)) .addOperand(MI->getOperand(1)) @@ -525,7 +525,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " <<*MIB); - MBB.erase(MI); + MBB.erase_instr(MI); ++NumNarrows; return true; } @@ -533,8 +533,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, if (Entry.LowRegs1 && !VerifyLowRegs(MI)) return false; - const MCInstrDesc &MCID = MI->getDesc(); - if (MCID.mayLoad() || MCID.mayStore()) + if (MI->mayLoad() || MI->mayStore()) return ReduceLoadStore(MBB, MI, Entry); switch (Opc) { @@ -654,7 +653,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, // Add the 16-bit instruction. DebugLoc dl = MI->getDebugLoc(); - MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewMCID); + MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID); MIB.addOperand(MI->getOperand(0)); if (NewMCID.hasOptionalDef()) { if (HasCC) @@ -678,7 +677,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); - MBB.erase(MI); + MBB.erase_instr(MI); ++Num2Addrs; return true; } @@ -745,7 +744,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, // Add the 16-bit instruction. DebugLoc dl = MI->getDebugLoc(); - MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewMCID); + MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID); MIB.addOperand(MI->getOperand(0)); if (NewMCID.hasOptionalDef()) { if (HasCC) @@ -785,7 +784,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); - MBB.erase(MI); + MBB.erase_instr(MI); ++NumNarrows; return true; } @@ -830,16 +829,22 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { // Yes, CPSR could be livein. bool LiveCPSR = MBB.isLiveIn(ARM::CPSR); MachineInstr *CPSRDef = 0; + MachineInstr *BundleMI = 0; // If this BB loops back to itself, conservatively avoid narrowing the // first instruction that does partial flag update. 
bool IsSelfLoop = MBB.isSuccessor(&MBB); - MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end(); - MachineBasicBlock::iterator NextMII; + MachineBasicBlock::instr_iterator MII = MBB.instr_begin(), E = MBB.instr_end(); + MachineBasicBlock::instr_iterator NextMII; for (; MII != E; MII = NextMII) { NextMII = llvm::next(MII); MachineInstr *MI = &*MII; + if (MI->isBundle()) { + BundleMI = MI; + continue; + } + LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR); unsigned Opcode = MI->getOpcode(); @@ -850,7 +855,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { if (Entry.Special) { if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) { Modified = true; - MachineBasicBlock::iterator I = prior(NextMII); + MachineBasicBlock::instr_iterator I = prior(NextMII); MI = &*I; } goto ProcessNext; @@ -860,7 +865,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { if (Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) { Modified = true; - MachineBasicBlock::iterator I = prior(NextMII); + MachineBasicBlock::instr_iterator I = prior(NextMII); MI = &*I; goto ProcessNext; } @@ -869,15 +874,24 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) { Modified = true; - MachineBasicBlock::iterator I = prior(NextMII); + MachineBasicBlock::instr_iterator I = prior(NextMII); MI = &*I; } } ProcessNext: + if (LiveCPSR && + NextMII != E && MI->isInsideBundle() && !NextMII->isInsideBundle() && + BundleMI->killsRegister(ARM::CPSR)) + // FIXME: Since post-ra scheduler operates on bundles, the CPSR kill + // marker is only on the BUNDLE instruction. Process the BUNDLE + // instruction as we finish with the bundled instruction to work around + // the inconsistency. + LiveCPSR = false; + bool DefCPSR = false; LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR); - if (MI->getDesc().isCall()) { + if (MI->isCall()) { // Calls don't really set CPSR. 
CPSRDef = 0; IsSelfLoop = false; diff --git a/lib/Target/CBackend/CMakeLists.txt b/lib/Target/CBackend/CMakeLists.txt index edf8ee7..fa819a4 100644 --- a/lib/Target/CBackend/CMakeLists.txt +++ b/lib/Target/CBackend/CMakeLists.txt @@ -2,16 +2,4 @@ add_llvm_target(CBackendCodeGen CBackend.cpp ) -add_llvm_library_dependencies(LLVMCBackendCodeGen - LLVMAnalysis - LLVMCBackendInfo - LLVMCodeGen - LLVMCore - LLVMMC - LLVMScalarOpts - LLVMSupport - LLVMTarget - LLVMTransformUtils - ) - add_subdirectory(TargetInfo) diff --git a/lib/Target/CBackend/CTargetMachine.h b/lib/Target/CBackend/CTargetMachine.h index ca346af..8b2286e 100644 --- a/lib/Target/CBackend/CTargetMachine.h +++ b/lib/Target/CBackend/CTargetMachine.h @@ -21,10 +21,10 @@ namespace llvm { struct CTargetMachine : public TargetMachine { CTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : TargetMachine(T, TT, CPU, FS) {} + : TargetMachine(T, TT, CPU, FS, Options) { } virtual bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out, diff --git a/lib/Target/CBackend/LLVMBuild.txt b/lib/Target/CBackend/LLVMBuild.txt index 851ded9..e64feb0 100644 --- a/lib/Target/CBackend/LLVMBuild.txt +++ b/lib/Target/CBackend/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = TargetInfo + [component_0] type = TargetGroup name = CBackend @@ -26,4 +29,3 @@ name = CBackendCodeGen parent = CBackend required_libraries = Analysis CBackendInfo CodeGen Core MC Scalar Support Target TransformUtils add_to_library_groups = CBackend - diff --git a/lib/Target/CBackend/TargetInfo/CMakeLists.txt b/lib/Target/CBackend/TargetInfo/CMakeLists.txt index 8e616be..6203616 100644 --- a/lib/Target/CBackend/TargetInfo/CMakeLists.txt +++ b/lib/Target/CBackend/TargetInfo/CMakeLists.txt @@ -3,9 +3,3 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. 
${CMAKE_CURRENT_SOURCE_DIR}/ add_llvm_library(LLVMCBackendInfo CBackendTargetInfo.cpp ) - -add_llvm_library_dependencies(LLVMCBackendInfo - LLVMMC - LLVMSupport - LLVMTarget - ) diff --git a/lib/Target/CBackend/TargetInfo/LLVMBuild.txt b/lib/Target/CBackend/TargetInfo/LLVMBuild.txt index 35752b7..1b47d8e 100644 --- a/lib/Target/CBackend/TargetInfo/LLVMBuild.txt +++ b/lib/Target/CBackend/TargetInfo/LLVMBuild.txt @@ -21,4 +21,3 @@ name = CBackendInfo parent = CBackend required_libraries = MC Support Target add_to_library_groups = CBackend - diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt index 60e2189..22d8c76 100644 --- a/lib/Target/CMakeLists.txt +++ b/lib/Target/CMakeLists.txt @@ -3,7 +3,6 @@ add_llvm_library(LLVMTarget Target.cpp TargetData.cpp TargetELFWriterInfo.cpp - TargetFrameLowering.cpp TargetInstrInfo.cpp TargetIntrinsicInfo.cpp TargetLibraryInfo.cpp @@ -13,12 +12,6 @@ add_llvm_library(LLVMTarget TargetSubtargetInfo.cpp ) -add_llvm_library_dependencies(LLVMTarget - LLVMCore - LLVMMC - LLVMSupport - ) - foreach(t ${LLVM_TARGETS_TO_BUILD}) message(STATUS "Targeting ${t}") add_subdirectory(${t}) diff --git a/lib/Target/CellSPU/CMakeLists.txt b/lib/Target/CellSPU/CMakeLists.txt index b442a5c..6c67c2d 100644 --- a/lib/Target/CellSPU/CMakeLists.txt +++ b/lib/Target/CellSPU/CMakeLists.txt @@ -23,17 +23,5 @@ add_llvm_target(CellSPUCodeGen SPUNopFiller.cpp ) -add_llvm_library_dependencies(LLVMCellSPUCodeGen - LLVMAsmPrinter - LLVMCellSPUDesc - LLVMCellSPUInfo - LLVMCodeGen - LLVMCore - LLVMMC - LLVMSelectionDAG - LLVMSupport - LLVMTarget - ) - add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/CellSPU/LLVMBuild.txt b/lib/Target/CellSPU/LLVMBuild.txt index 4ae26b2..277620b 100644 --- a/lib/Target/CellSPU/LLVMBuild.txt +++ b/lib/Target/CellSPU/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = MCTargetDesc TargetInfo + [component_0] type = TargetGroup name = CellSPU @@ -27,4 +30,3 @@ name = CellSPUCodeGen parent = CellSPU required_libraries = AsmPrinter CellSPUDesc CellSPUInfo CodeGen Core MC SelectionDAG Support Target add_to_library_groups = CellSPU - diff --git a/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt b/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt index d41fe93..0027bdb 100644 --- a/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt @@ -3,9 +3,4 @@ add_llvm_library(LLVMCellSPUDesc SPUMCAsmInfo.cpp ) -add_llvm_library_dependencies(LLVMCellSPUDesc - LLVMCellSPUInfo - LLVMMC - ) - add_dependencies(LLVMCellSPUDesc CellSPUCommonTableGen) diff --git a/lib/Target/CellSPU/MCTargetDesc/LLVMBuild.txt b/lib/Target/CellSPU/MCTargetDesc/LLVMBuild.txt index abc44a2..71e5bbc 100644 --- a/lib/Target/CellSPU/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/CellSPU/MCTargetDesc/LLVMBuild.txt @@ -21,4 +21,3 @@ name = CellSPUDesc parent = CellSPU required_libraries = CellSPUInfo MC add_to_library_groups = CellSPU - diff --git a/lib/Target/CellSPU/SPUFrameLowering.cpp b/lib/Target/CellSPU/SPUFrameLowering.cpp index 093f99f..916f9ba 100644 --- a/lib/Target/CellSPU/SPUFrameLowering.cpp +++ b/lib/Target/CellSPU/SPUFrameLowering.cpp @@ -47,7 +47,8 @@ bool SPUFrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); return MFI->getStackSize() && - (DisableFramePointerElim(MF) || MFI->hasVarSizedObjects()); + (MF.getTarget().Options.DisableFramePointerElim(MF) || + 
MFI->hasVarSizedObjects()); } diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index d58e49b..dc0d5a6 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -296,12 +296,22 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::CTTZ , MVT::i32, Expand); setOperationAction(ISD::CTTZ , MVT::i64, Expand); setOperationAction(ISD::CTTZ , MVT::i128, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i128, Expand); setOperationAction(ISD::CTLZ , MVT::i8, Promote); setOperationAction(ISD::CTLZ , MVT::i16, Promote); setOperationAction(ISD::CTLZ , MVT::i32, Legal); setOperationAction(ISD::CTLZ , MVT::i64, Expand); setOperationAction(ISD::CTLZ , MVT::i128, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i128, Expand); // SPU has a version of select that implements (a&~c)|(b&c), just like // select ought to work: diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp index 6940316..1e922a4 100644 --- a/lib/Target/CellSPU/SPUTargetMachine.cpp +++ b/lib/Target/CellSPU/SPUTargetMachine.cpp @@ -34,9 +34,10 @@ SPUFrameLowering::getCalleeSaveSpillSlots(unsigned &NumEntries) const { SPUTargetMachine::SPUTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), Subtarget(TT, CPU, FS), DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this), diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h index 909f12e..0841fee 100644 --- a/lib/Target/CellSPU/SPUTargetMachine.h +++ b/lib/Target/CellSPU/SPUTargetMachine.h @@ -39,7 +39,7 @@ class SPUTargetMachine : public LLVMTargetMachine { InstrItineraryData InstrItins; public: SPUTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); diff --git a/lib/Target/CellSPU/TargetInfo/CMakeLists.txt b/lib/Target/CellSPU/TargetInfo/CMakeLists.txt index 3f2d6b09..6a98f95 100644 --- a/lib/Target/CellSPU/TargetInfo/CMakeLists.txt +++ b/lib/Target/CellSPU/TargetInfo/CMakeLists.txt @@ -4,10 +4,4 @@ add_llvm_library(LLVMCellSPUInfo CellSPUTargetInfo.cpp ) -add_llvm_library_dependencies(LLVMCellSPUInfo - LLVMMC - LLVMSupport - LLVMTarget - ) - add_dependencies(LLVMCellSPUInfo CellSPUCommonTableGen) diff --git a/lib/Target/CellSPU/TargetInfo/LLVMBuild.txt b/lib/Target/CellSPU/TargetInfo/LLVMBuild.txt index 0710cc3..6937e70 100644 --- a/lib/Target/CellSPU/TargetInfo/LLVMBuild.txt +++ b/lib/Target/CellSPU/TargetInfo/LLVMBuild.txt @@ -21,4 +21,3 @@ name = CellSPUInfo parent = CellSPU required_libraries = MC Support Target add_to_library_groups = CellSPU - diff --git a/lib/Target/CppBackend/CMakeLists.txt 
b/lib/Target/CppBackend/CMakeLists.txt index 53f6868..515e1dd 100644 --- a/lib/Target/CppBackend/CMakeLists.txt +++ b/lib/Target/CppBackend/CMakeLists.txt @@ -2,11 +2,4 @@ add_llvm_target(CppBackendCodeGen CPPBackend.cpp ) -add_llvm_library_dependencies(LLVMCppBackendCodeGen - LLVMCore - LLVMCppBackendInfo - LLVMSupport - LLVMTarget - ) - add_subdirectory(TargetInfo) diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h index a3613b4..92bca6c 100644 --- a/lib/Target/CppBackend/CPPTargetMachine.h +++ b/lib/Target/CppBackend/CPPTargetMachine.h @@ -23,10 +23,10 @@ class formatted_raw_ostream; struct CPPTargetMachine : public TargetMachine { CPPTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : TargetMachine(T, TT, CPU, FS) {} + : TargetMachine(T, TT, CPU, FS, Options) {} virtual bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out, diff --git a/lib/Target/CppBackend/LLVMBuild.txt b/lib/Target/CppBackend/LLVMBuild.txt index 77e31c7..122b5e7 100644 --- a/lib/Target/CppBackend/LLVMBuild.txt +++ b/lib/Target/CppBackend/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = TargetInfo + [component_0] type = TargetGroup name = CppBackend @@ -26,4 +29,3 @@ name = CppBackendCodeGen parent = CppBackend required_libraries = Core CppBackendInfo Support Target add_to_library_groups = CppBackend - diff --git a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt index 738b215..f82d72e 100644 --- a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt +++ b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt @@ -3,9 +3,3 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. 
${CMAKE_CURRENT_SOURCE_DIR}/ add_llvm_library(LLVMCppBackendInfo CppBackendTargetInfo.cpp ) - -add_llvm_library_dependencies(LLVMCppBackendInfo - LLVMMC - LLVMSupport - LLVMTarget - ) diff --git a/lib/Target/CppBackend/TargetInfo/LLVMBuild.txt b/lib/Target/CppBackend/TargetInfo/LLVMBuild.txt index 67a23ba..d4dfc3e 100644 --- a/lib/Target/CppBackend/TargetInfo/LLVMBuild.txt +++ b/lib/Target/CppBackend/TargetInfo/LLVMBuild.txt @@ -21,4 +21,3 @@ name = CppBackendInfo parent = CppBackend required_libraries = MC Support Target add_to_library_groups = CppBackend - diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt new file mode 100644 index 0000000..f8705ee --- /dev/null +++ b/lib/Target/Hexagon/CMakeLists.txt @@ -0,0 +1,35 @@ +set(LLVM_TARGET_DEFINITIONS Hexagon.td) + +tablegen(LLVM HexagonGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM HexagonGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM HexagonGenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM HexagonGenDAGISel.inc -gen-dag-isel) +tablegen(LLVM HexagonGenCallingConv.inc -gen-callingconv) +tablegen(LLVM HexagonGenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM HexagonGenIntrinsics.inc -gen-tgt-intrinsic) +add_public_tablegen_target(HexagonCommonTableGen) + +add_llvm_target(HexagonCodeGen + HexagonAsmPrinter.cpp + HexagonCallingConvLower.cpp + HexagonCFGOptimizer.cpp + HexagonExpandPredSpillCode.cpp + HexagonFrameLowering.cpp + HexagonHardwareLoops.cpp + HexagonInstrInfo.cpp + HexagonISelDAGToDAG.cpp + HexagonISelLowering.cpp + HexagonMCAsmInfo.cpp + HexagonOptimizeSZExtends.cpp + HexagonRegisterInfo.cpp + HexagonRemoveSZExtArgs.cpp + HexagonSelectionDAGInfo.cpp + HexagonSplitTFRCondSets.cpp + HexagonSubtarget.cpp + HexagonTargetMachine.cpp + HexagonTargetObjectFile.cpp + ) + +add_subdirectory(TargetInfo) +add_subdirectory(MCTargetDesc) + diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h new file mode 100644 index 0000000..a5f2279 --- /dev/null +++ b/lib/Target/Hexagon/Hexagon.h @@ -0,0 +1,54 @@ +//=-- Hexagon.h - Top-level interface for Hexagon representation --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// Hexagon back-end. 
+// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_Hexagon_H +#define TARGET_Hexagon_H + +#include <cassert> +#include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + class FunctionPass; + class TargetMachine; + class HexagonTargetMachine; + class raw_ostream; + + FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM); + FunctionPass *createHexagonDelaySlotFillerPass(TargetMachine &TM); + FunctionPass *createHexagonFPMoverPass(TargetMachine &TM); + FunctionPass *createHexagonRemoveExtendOps(HexagonTargetMachine &TM); + FunctionPass *createHexagonCFGOptimizer(HexagonTargetMachine &TM); + + FunctionPass* createHexagonSplitTFRCondSets(HexagonTargetMachine &TM); + FunctionPass* createHexagonExpandPredSpillCode(HexagonTargetMachine &TM); + + FunctionPass *createHexagonHardwareLoops(); + FunctionPass *createHexagonOptimizeSZExtends(); + FunctionPass *createHexagonFixupHwLoops(); + +} // end namespace llvm; + +#define Hexagon_POINTER_SIZE 4 + +#define Hexagon_PointerSize (Hexagon_POINTER_SIZE) +#define Hexagon_PointerSize_Bits (Hexagon_POINTER_SIZE * 8) +#define Hexagon_WordSize Hexagon_PointerSize +#define Hexagon_WordSize_Bits Hexagon_PointerSize_Bits + +// allocframe saves LR and FP on stack before allocating +// a new stack frame. This takes 8 bytes. +#define HEXAGON_LRFP_SIZE 8 + +#endif diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td new file mode 100644 index 0000000..72939e6 --- /dev/null +++ b/lib/Target/Hexagon/Hexagon.td @@ -0,0 +1,66 @@ +//===- Hexagon.td - Describe the Hexagon Target Machine ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// Hexagon Subtarget features. +// + + +// Hexagon Archtectures +def ArchV2 : SubtargetFeature<"v2", "HexagonArchVersion", "V2", + "Hexagon v2">; +def ArchV3 : SubtargetFeature<"v3", "HexagonArchVersion", "V3", + "Hexagon v3">; +def ArchV4 : SubtargetFeature<"v4", "HexagonArchVersion", "V4", + "Hexagon v4">; + +//===----------------------------------------------------------------------===// +// Register File, Calling Conv, Instruction Descriptions +//===----------------------------------------------------------------------===// +include "HexagonSchedule.td" +include "HexagonRegisterInfo.td" +include "HexagonCallingConv.td" +include "HexagonInstrInfo.td" +include "HexagonIntrinsics.td" +include "HexagonIntrinsicsDerived.td" + + +def HexagonInstrInfo : InstrInfo { + // Define how we want to layout our target-specific information field. +} + +//===----------------------------------------------------------------------===// +// Hexagon processors supported. 
+//===----------------------------------------------------------------------===// + +class Proc<string Name, ProcessorItineraries Itin, + list<SubtargetFeature> Features> + : Processor<Name, Itin, Features>; + +def : Proc<"hexagonv2", HexagonItineraries, [ArchV2]>; +def : Proc<"hexagonv3", HexagonItineraries, [ArchV2, ArchV3]>; +def : Proc<"hexagonv4", HexagonItinerariesV4, [ArchV2, ArchV3, ArchV4]>; + +//===----------------------------------------------------------------------===// +// Declare the target which we are implementing +//===----------------------------------------------------------------------===// + +def Hexagon : Target { + // Pull in Instruction Info: + let InstructionSet = HexagonInstrInfo; +} diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp new file mode 100644 index 0000000..8f8e804 --- /dev/null +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -0,0 +1,555 @@ +//===-- HexagonAsmPrinter.cpp - Print machine instrs to Hexagon assembly ----=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to Hexagon assembly language. This printer is +// the output mechanism used by `llc'. +// +// Documentation at http://developer.apple.com/documentation/DeveloperTools/ +// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html +// +//===----------------------------------------------------------------------===// + + +#define DEBUG_TYPE "asm-printer" +#include "Hexagon.h" +#include "HexagonTargetMachine.h" +#include "HexagonSubtarget.h" +#include "HexagonMachineFunctionInfo.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +static cl::opt<bool> AlignCalls( + "hexagon-align-calls", cl::Hidden, cl::init(true), + cl::desc("Insert falign after call instruction for Hexagon target")); + + +namespace { + class HexagonAsmPrinter : public AsmPrinter { + const HexagonSubtarget *Subtarget; + + public: + explicit HexagonAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer) { + Subtarget = &TM.getSubtarget<HexagonSubtarget>(); + } + + virtual const char *getPassName() const { + return "Hexagon Assembly Printer"; + } + + /// printInstruction - This method is 
automatically generated by tablegen + /// from the instruction set description. This method returns true if the + /// machine instruction was sufficiently described to print it, otherwise it + void printInstruction(const MachineInstr *MI, raw_ostream &O); + virtual void EmitInstruction(const MachineInstr *MI); + + void printOp(const MachineOperand &MO, raw_ostream &O); + + /// printRegister - Print register according to target requirements. + /// + void printRegister(const MachineOperand &MO, bool R0AsZero, + raw_ostream &O) { + unsigned RegNo = MO.getReg(); + assert(TargetRegisterInfo::isPhysicalRegister(RegNo) && "Not physreg??"); + O << getRegisterName(RegNo); + } + + void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &OS) { + const MachineOperand &MO = MI->getOperand(OpNo); + if (MO.isReg()) { + printRegister(MO, false, OS); + } else if (MO.isImm()) { + OS << MO.getImm(); + } else { + printOp(MO, OS); + } + } + + + bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const; + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS); + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS); + + + void printHexagonImmOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + int value = MI->getOperand(OpNo).getImm(); + O << value; + } + + + void printHexagonNegImmOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + int value = MI->getOperand(OpNo).getImm(); + O << -value; + } + + void printHexagonMEMriOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNo); + const MachineOperand &MO2 = MI->getOperand(OpNo+1); + + O << getRegisterName(MO1.getReg()) + << " + #" + << (int) MO2.getImm(); + } + + + void printHexagonFrameIndexOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNo); + const MachineOperand &MO2 = MI->getOperand(OpNo+1); + + O << getRegisterName(MO1.getReg()) + << ", #" + << MO2.getImm(); + } + + void printBranchOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + // Branches can take an immediate operand. This is used by the branch + // selection pass to print $+8, an eight byte displacement from the PC. 
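+ // For example, an immediate operand of 2 is printed as "$+8" (2 words of
+ // 4 bytes each); non-immediate operands fall through to printOp().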
+ if (MI->getOperand(OpNo).isImm()) { + O << "$+" << MI->getOperand(OpNo).getImm()*4; + } else { + printOp(MI->getOperand(OpNo), O); + } + } + + void printCallOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + } + + void printAbsAddrOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + } + + + void printSymbolHi(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + O << "#HI("; + if (MI->getOperand(OpNo).isImm()) { + printHexagonImmOperand(MI, OpNo, O); + } else { + printOp(MI->getOperand(OpNo), O); + } + O << ")"; + } + + void printSymbolLo(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + O << "#HI("; + if (MI->getOperand(OpNo).isImm()) { + printHexagonImmOperand(MI, OpNo, O); + } else { + printOp(MI->getOperand(OpNo), O); + } + O << ")"; + } + + void printPredicateOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O); + + void printAddrModeBasePlusOffset(const MachineInstr *MI, int OpNo, + raw_ostream &O); + + void printGlobalOperand(const MachineInstr *MI, int OpNo, raw_ostream &O); + void printJumpTable(const MachineInstr *MI, int OpNo, raw_ostream &O); + + void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const; + + static const char *getRegisterName(unsigned RegNo); + }; + +} // end of anonymous namespace + +// Include the auto-generated portion of the assembly writer. +#include "HexagonGenAsmWriter.inc" + + +void HexagonAsmPrinter::EmitAlignment(unsigned NumBits, + const GlobalValue *GV) const { + + // For basic block level alignment, use falign. + if (!GV) { + OutStreamer.EmitRawText(StringRef("\t.falign")); + return; + } + + AsmPrinter::EmitAlignment(NumBits, GV); +} + +void HexagonAsmPrinter::printOp(const MachineOperand &MO, raw_ostream &O) { + switch (MO.getType()) { + case MachineOperand::MO_Immediate: + dbgs() << "printOp() does not handle immediate values\n"; + abort(); + return; + + case MachineOperand::MO_MachineBasicBlock: + O << *MO.getMBB()->getSymbol(); + return; + case MachineOperand::MO_JumpTableIndex: + O << *GetJTISymbol(MO.getIndex()); + // FIXME: PIC relocation model. + return; + case MachineOperand::MO_ConstantPoolIndex: + O << *GetCPISymbol(MO.getIndex()); + return; + case MachineOperand::MO_ExternalSymbol: + O << *GetExternalSymbolSymbol(MO.getSymbolName()); + return; + case MachineOperand::MO_GlobalAddress: { + // Computing the address of a global symbol, not calling it. + O << *Mang->getSymbol(MO.getGlobal()); + printOffset(MO.getOffset(), O); + return; + } + + default: + O << "<unknown operand type: " << MO.getType() << ">"; + return; + } +} + + +// +// isBlockOnlyReachableByFallthrough - We need to override this since the +// default AsmPrinter does not print labels for any basic block that +// is only reachable by a fall through. That works for all cases except +// for the case in which the basic block is reachable by a fall through but +// through an indirect from a jump table. In this case, the jump table +// will contain a label not defined by AsmPrinter. +// +bool HexagonAsmPrinter:: +isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { + if (MBB->hasAddressTaken()) { + return false; + } + return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB); +} + + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &OS) { + // Does this asm operand have a single letter operand modifier? 
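+ // The modifiers handled below are 'c' (print with no prefix), 'L' (the
+ // second register of a 64-bit register pair) and 'I' (print "i" when the
+ // operand is an immediate); any other modifier is reported as unknown.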
+ if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'c': // Don't print "$" before a global var name or constant. + // Hexagon never has a prefix. + printOperand(MI, OpNo, OS); + return false; + case 'L': // Write second word of DImode reference. + // Verify that this operand has two consecutive registers. + if (!MI->getOperand(OpNo).isReg() || + OpNo+1 == MI->getNumOperands() || + !MI->getOperand(OpNo+1).isReg()) + return true; + ++OpNo; // Return the high-part. + break; + case 'I': + // Write 'i' if an integer constant, otherwise nothing. Used to print + // addi vs add, etc. + if (MI->getOperand(OpNo).isImm()) + OS << "i"; + return false; + } + } + + printOperand(MI, OpNo, OS); + return false; +} + +bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + + const MachineOperand &Base = MI->getOperand(OpNo); + const MachineOperand &Offset = MI->getOperand(OpNo+1); + + if (Base.isReg()) + printOperand(MI, OpNo, O); + else + assert(0 && "Unimplemented"); + + if (Offset.isImm()) { + if (Offset.getImm()) + O << " + #" << Offset.getImm(); + } + else + assert(0 && "Unimplemented"); + + return false; +} + +void HexagonAsmPrinter::printPredicateOperand(const MachineInstr *MI, + unsigned OpNo, + raw_ostream &O) { + assert(0 && "Unimplemented"); +} + + +/// printMachineInstruction -- Print out a single Hexagon MI in Darwin syntax to +/// the current output stream. +/// +void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { + SmallString<128> Str; + raw_svector_ostream O(Str); + + const MachineFunction* MF = MI->getParent()->getParent(); + const HexagonMachineFunctionInfo* MFI = + (const HexagonMachineFunctionInfo*) + MF->getInfo<HexagonMachineFunctionInfo>(); + + + + // Print a brace for the beginning of the packet. + if (MFI->isStartPacket(MI)) { + O << "\t{" << '\n'; + } + + DEBUG( O << "// MI = " << *MI << '\n';); + + // Indent + O << "\t"; + + + if (MI->getOpcode() == Hexagon::ENDLOOP0) { + if (MFI->isEndPacket(MI) && MFI->isStartPacket(MI)) { + O << "\t{ nop }"; + } else { + O << "}"; + } + printInstruction(MI, O); + } else if (MI->getOpcode() == Hexagon::STriwt) { + // + // Handle truncated store on Hexagon. 
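+ // The 64-bit source is narrowed by printing only its low 32-bit
+ // subregister, e.g. "memw(r29 + #0) = r2" when the low half of the source
+ // pair is r2 (the register names here are only illustrative).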
+ // + O << "\tmemw("; + printHexagonMEMriOperand(MI, 0, O); + + O << ") = "; + unsigned SubRegNum = + TM.getRegisterInfo()->getSubReg(MI->getOperand(2) + .getReg(), Hexagon::subreg_loreg); + const char *SubRegName = getRegisterName(SubRegNum); + O << SubRegName << '\n'; + } else if (MI->getOpcode() == Hexagon::MPYI_rin) { + // Handle multipy with -ve constant on Hexagon: + // "$dst =- mpyi($src1, #$src2)" + printOperand(MI, 0, O); + O << " =- mpyi("; + printOperand(MI, 1, O); + O << ", #"; + printHexagonNegImmOperand(MI, 2, O); + O << ")"; + } else if (MI->getOpcode() == Hexagon::MEMw_ADDSUBi_indexed_MEM_V4) { + // + // Handle memw(Rs+u6:2) [+-]= #U5 + // + O << "\tmemw("; printHexagonMEMriOperand(MI, 0, O); O << ") "; + int addend = MI->getOperand(2).getImm(); + if (addend < 0) + O << "-= " << "#" << -addend << '\n'; + else + O << "+= " << "#" << addend << '\n'; + } else if (MI->getOpcode() == Hexagon::MEMw_ADDSUBi_MEM_V4) { + // + // Handle memw(Rs+u6:2) [+-]= #U5 + // + O << "\tmemw("; printHexagonMEMriOperand(MI, 0, O); O << ") "; + int addend = MI->getOperand(2).getImm(); + if (addend < 0) + O << "-= " << "#" << -addend << '\n'; + else + O << "+= " << "#" << addend << '\n'; + } else if (MI->getOpcode() == Hexagon::MEMh_ADDSUBi_indexed_MEM_V4) { + // + // Handle memh(Rs+u6:1) [+-]= #U5 + // + O << "\tmemh("; printHexagonMEMriOperand(MI, 0, O); O << ") "; + int addend = MI->getOperand(2).getImm(); + if (addend < 0) + O << "-= " << "#" << -addend << '\n'; + else + O << "+= " << "#" << addend << '\n'; + } else if (MI->getOpcode() == Hexagon::MEMh_ADDSUBi_MEM_V4) { + // + // Handle memh(Rs+u6:1) [+-]= #U5 + // + O << "\tmemh("; printHexagonMEMriOperand(MI, 0, O); O << ") "; + int addend = MI->getOperand(2).getImm(); + if (addend < 0) + O << "-= " << "#" << -addend << '\n'; + else + O << "+= " << "#" << addend << '\n'; + } else if (MI->getOpcode() == Hexagon::MEMb_ADDSUBi_indexed_MEM_V4) { + // + // Handle memb(Rs+u6:1) [+-]= #U5 + // + O << "\tmemb("; printHexagonMEMriOperand(MI, 0, O); O << ") "; + int addend = MI->getOperand(2).getImm(); + if (addend < 0) + O << "-= " << "#" << -addend << '\n'; + else + O << "+= " << "#" << addend << '\n'; + } else if (MI->getOpcode() == Hexagon::MEMb_ADDSUBi_MEM_V4) { + // + // Handle memb(Rs+u6:1) [+-]= #U5 + // + O << "\tmemb("; printHexagonMEMriOperand(MI, 0, O); O << ") "; + int addend = MI->getOperand(2).getImm(); + if (addend < 0) + O << "-= " << "#" << -addend << '\n'; + else + O << "+= " << "#" << addend << '\n'; + } else if (MI->getOpcode() == Hexagon::CMPbGTri_V4) { + // + // Handle Pd=cmpb.gt(Rs,#s8) + // + O << "\t"; + printRegister(MI->getOperand(0), false, O); + O << " = cmpb.gt("; + printRegister(MI->getOperand(1), false, O); + O << ", "; + int val = MI->getOperand(2).getImm() >> 24; + O << "#" << val << ")" << '\n'; + } else if (MI->getOpcode() == Hexagon::CMPhEQri_V4) { + // + // Handle Pd=cmph.eq(Rs,#8) + // + O << "\t"; + printRegister(MI->getOperand(0), false, O); + O << " = cmph.eq("; + printRegister(MI->getOperand(1), false, O); + O << ", "; + int val = MI->getOperand(2).getImm(); + assert((((0 <= val) && (val <= 127)) || + ((65408 <= val) && (val <= 65535))) && + "Not in correct range!"); + if (val >= 65408) val -= 65536; + O << "#" << val << ")" << '\n'; + } else if (MI->getOpcode() == Hexagon::CMPhGTri_V4) { + // + // Handle Pd=cmph.gt(Rs,#8) + // + O << "\t"; + printRegister(MI->getOperand(0), false, O); + O << " = cmph.gt("; + printRegister(MI->getOperand(1), false, O); + O << ", "; + int val = MI->getOperand(2).getImm() >> 
16; + O << "#" << val << ")" << '\n'; + } else { + printInstruction(MI, O); + } + + // Print a brace for the end of the packet. + if (MFI->isEndPacket(MI) && MI->getOpcode() != Hexagon::ENDLOOP0) { + O << "\n\t}" << '\n'; + } + + if (AlignCalls && MI->getDesc().isCall()) { + O << "\n\t.falign" << "\n"; + } + + OutStreamer.EmitRawText(O.str()); + return; +} + +/// PrintUnmangledNameSafely - Print out the printable characters in the name. +/// Don't print things like \n or \0. +// static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) { +// for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen(); +// Name != E; ++Name) +// if (isprint(*Name)) +// OS << *Name; +// } + + +void HexagonAsmPrinter::printAddrModeBasePlusOffset(const MachineInstr *MI, + int OpNo, raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNo); + const MachineOperand &MO2 = MI->getOperand(OpNo+1); + + O << getRegisterName(MO1.getReg()) + << " + #" + << MO2.getImm(); +} + + +void HexagonAsmPrinter::printGlobalOperand(const MachineInstr *MI, int OpNo, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNo); + assert( (MO.getType() == MachineOperand::MO_GlobalAddress) && + "Expecting global address"); + + O << *Mang->getSymbol(MO.getGlobal()); + if (MO.getOffset() != 0) { + O << " + "; + O << MO.getOffset(); + } +} + +void HexagonAsmPrinter::printJumpTable(const MachineInstr *MI, int OpNo, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNo); + assert( (MO.getType() == MachineOperand::MO_JumpTableIndex) && + "Expecting jump table index"); + + // Hexagon_TODO: Do we need name mangling? + O << *GetJTISymbol(MO.getIndex()); +} + +extern "C" void LLVMInitializeHexagonAsmPrinter() { + RegisterAsmPrinter<HexagonAsmPrinter> X(TheHexagonTarget); +} diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp new file mode 100644 index 0000000..38000e7 --- /dev/null +++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -0,0 +1,240 @@ +//===---- HexagonCFGOptimizer.cpp - CFG optimizations ---------------------===// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
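+//
+// This pass tidies the CFG by inverting a conditional jump that merely hops
+// over an unconditional jump, retargeting it directly and emptying the
+// jump-only block (see the two cases documented in runOnMachineFunction).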
+// +//===----------------------------------------------------------------------===// + + +#define DEBUG_TYPE "hexagon_cfg" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "HexagonTargetMachine.h" +#include "HexagonSubtarget.h" +#include "HexagonMachineFunctionInfo.h" +#include <iostream> + +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +namespace { + +class HexagonCFGOptimizer : public MachineFunctionPass { + +private: + HexagonTargetMachine& QTM; + const HexagonSubtarget &QST; + + void InvertAndChangeJumpTarget(MachineInstr*, MachineBasicBlock*); + + public: + static char ID; + HexagonCFGOptimizer(HexagonTargetMachine& TM) : MachineFunctionPass(ID), + QTM(TM), + QST(*TM.getSubtargetImpl()) {} + + const char *getPassName() const { + return "Hexagon CFG Optimizer"; + } + bool runOnMachineFunction(MachineFunction &Fn); +}; + + +char HexagonCFGOptimizer::ID = 0; + +static bool IsConditionalBranch(int Opc) { + return (Opc == Hexagon::JMP_Pred) || (Opc == Hexagon::JMP_PredNot) + || (Opc == Hexagon::JMP_PredPt) || (Opc == Hexagon::JMP_PredNotPt); +} + + +static bool IsUnconditionalJump(int Opc) { + return (Opc == Hexagon::JMP); +} + + +void +HexagonCFGOptimizer::InvertAndChangeJumpTarget(MachineInstr* MI, + MachineBasicBlock* NewTarget) { + const HexagonInstrInfo *QII = QTM.getInstrInfo(); + int NewOpcode = 0; + switch(MI->getOpcode()) { + case Hexagon::JMP_Pred: + NewOpcode = Hexagon::JMP_PredNot; + break; + + case Hexagon::JMP_PredNot: + NewOpcode = Hexagon::JMP_Pred; + break; + + case Hexagon::JMP_PredPt: + NewOpcode = Hexagon::JMP_PredNotPt; + break; + + case Hexagon::JMP_PredNotPt: + NewOpcode = Hexagon::JMP_PredPt; + break; + + default: + assert(0 && "Cannot handle this case"); + } + + MI->setDesc(QII->get(NewOpcode)); + MI->getOperand(1).setMBB(NewTarget); +} + + +bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { + + // Loop over all of the basic blocks. + for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); + MBBb != MBBe; ++MBBb) { + MachineBasicBlock* MBB = MBBb; + + // Traverse the basic block. + MachineBasicBlock::iterator MII = MBB->getFirstTerminator(); + if (MII != MBB->end()) { + MachineInstr *MI = MII; + int Opc = MI->getOpcode(); + if (IsConditionalBranch(Opc)) { + + // + // (Case 1) Transform the code if the following condition occurs: + // BB1: if (p0) jump BB3 + // ...falls-through to BB2 ... + // BB2: jump BB4 + // ...next block in layout is BB3... + // BB3: ... + // + // Transform this to: + // BB1: if (!p0) jump BB4 + // Remove BB2 + // BB3: ... + // + // (Case 2) A variation occurs when BB3 contains a JMP to BB4: + // BB1: if (p0) jump BB3 + // ...falls-through to BB2 ... + // BB2: jump BB4 + // ...other basic blocks ... + // BB4: + // ...not a fall-thru + // BB3: ... + // jump BB4 + // + // Transform this to: + // BB1: if (!p0) jump BB4 + // Remove BB2 + // BB3: ... + // BB4: ... 
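+ // Both cases are handled below by inverting the conditional jump and
+ // retargeting it at the unconditional jump's destination; the jump in BB2
+ // is then erased so BB2 falls through, and in case 2 BB3 is moved up to
+ // become the new fall-through block.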
+ // + unsigned NumSuccs = MBB->succ_size(); + MachineBasicBlock::succ_iterator SI = MBB->succ_begin(); + MachineBasicBlock* FirstSucc = *SI; + MachineBasicBlock* SecondSucc = *(++SI); + MachineBasicBlock* LayoutSucc = NULL; + MachineBasicBlock* JumpAroundTarget = NULL; + + if (MBB->isLayoutSuccessor(FirstSucc)) { + LayoutSucc = FirstSucc; + JumpAroundTarget = SecondSucc; + } else if (MBB->isLayoutSuccessor(SecondSucc)) { + LayoutSucc = SecondSucc; + JumpAroundTarget = FirstSucc; + } else { + // Odd case...cannot handle. + } + + // The target of the unconditional branch must be JumpAroundTarget. + // TODO: If not, we should not invert the unconditional branch. + MachineBasicBlock* CondBranchTarget = NULL; + if ((MI->getOpcode() == Hexagon::JMP_Pred) || + (MI->getOpcode() == Hexagon::JMP_PredNot)) { + CondBranchTarget = MI->getOperand(1).getMBB(); + } + + if (!LayoutSucc || (CondBranchTarget != JumpAroundTarget)) { + continue; + } + + if ((NumSuccs == 2) && LayoutSucc && (LayoutSucc->pred_size() == 1)) { + + // Ensure that BB2 has one instruction -- an unconditional jump. + if ((LayoutSucc->size() == 1) && + IsUnconditionalJump(LayoutSucc->front().getOpcode())) { + MachineBasicBlock* UncondTarget = + LayoutSucc->front().getOperand(0).getMBB(); + // Check if the layout successor of BB2 is BB3. + bool case1 = LayoutSucc->isLayoutSuccessor(JumpAroundTarget); + bool case2 = JumpAroundTarget->isSuccessor(UncondTarget) && + JumpAroundTarget->size() >= 1 && + IsUnconditionalJump(JumpAroundTarget->back().getOpcode()) && + JumpAroundTarget->pred_size() == 1 && + JumpAroundTarget->succ_size() == 1; + + if (case1 || case2) { + InvertAndChangeJumpTarget(MI, UncondTarget); + MBB->removeSuccessor(JumpAroundTarget); + MBB->addSuccessor(UncondTarget); + + // Remove the unconditional branch in LayoutSucc. + LayoutSucc->erase(LayoutSucc->begin()); + LayoutSucc->removeSuccessor(UncondTarget); + LayoutSucc->addSuccessor(JumpAroundTarget); + + // This code performs the conversion for case 2, which moves + // the block to the fall-thru case (BB3 in the code above). + if (case2 && !case1) { + JumpAroundTarget->moveAfter(LayoutSucc); + // only move a block if it doesn't have a fall-thru. otherwise + // the CFG will be incorrect. + if (!UncondTarget->canFallThrough()) { + UncondTarget->moveAfter(JumpAroundTarget); + } + } + + // + // Correct live-in information. Is used by post-RA scheduler + // The live-in to LayoutSucc is now all values live-in to + // JumpAroundTarget. 
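+ // (LayoutSucc now falls through into JumpAroundTarget, so it must carry
+ // JumpAroundTarget's live-ins instead of its old set.)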
+ // + std::vector<unsigned> OrigLiveIn(LayoutSucc->livein_begin(), + LayoutSucc->livein_end()); + std::vector<unsigned> NewLiveIn(JumpAroundTarget->livein_begin(), + JumpAroundTarget->livein_end()); + for (unsigned i = 0; i < OrigLiveIn.size(); ++i) { + LayoutSucc->removeLiveIn(OrigLiveIn[i]); + } + for (unsigned i = 0; i < NewLiveIn.size(); ++i) { + LayoutSucc->addLiveIn(NewLiveIn[i]); + } + } + } + } + } + } + } + return true; +} +} + + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +FunctionPass *llvm::createHexagonCFGOptimizer(HexagonTargetMachine &TM) { + return new HexagonCFGOptimizer(TM); +} diff --git a/lib/Target/Hexagon/HexagonCallingConv.td b/lib/Target/Hexagon/HexagonCallingConv.td new file mode 100644 index 0000000..bd9608b --- /dev/null +++ b/lib/Target/Hexagon/HexagonCallingConv.td @@ -0,0 +1,35 @@ +//===- HexagonCallingConv.td - Calling Conventions Hexagon -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the Hexagon architectures. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Return Value Calling Conventions +//===----------------------------------------------------------------------===// + +// Hexagon 32-bit C return-value convention. +def RetCC_Hexagon32 : CallingConv<[ + CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>, + CCIfType<[i64], CCAssignToReg<[D0, D1, D2]>>, + + // Alternatively, they are assigned to the stack in 4-byte aligned units. + CCAssignToStack<4, 4> +]>; + +// Hexagon 32-bit C Calling convention. +def CC_Hexagon32 : CallingConv<[ + // All arguments get passed in integer registers if there is space. + CCIfType<[i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>, + CCIfType<[i64], CCAssignToReg<[D0, D1, D2]>>, + + // Alternatively, they are assigned to the stack in 4-byte aligned units. + CCAssignToStack<4, 4> +]>; diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/lib/Target/Hexagon/HexagonCallingConvLower.cpp new file mode 100644 index 0000000..2e51dbf --- /dev/null +++ b/lib/Target/Hexagon/HexagonCallingConvLower.cpp @@ -0,0 +1,207 @@ +//===-- llvm/CallingConvLower.cpp - Calling Convention lowering -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Hexagon_CCState class, used for lowering and +// implementing calling conventions. 
Adapted from the machine independent +// version of the class (CCState) but this handles calls to varargs functions +// +//===----------------------------------------------------------------------===// + +#include "HexagonCallingConvLower.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "Hexagon.h" +using namespace llvm; + +Hexagon_CCState::Hexagon_CCState(CallingConv::ID CC, bool isVarArg, + const TargetMachine &tm, + SmallVector<CCValAssign, 16> &locs, + LLVMContext &c) + : CallingConv(CC), IsVarArg(isVarArg), TM(tm), + TRI(*TM.getRegisterInfo()), Locs(locs), Context(c) { + // No stack is used. + StackOffset = 0; + + UsedRegs.resize((TRI.getNumRegs()+31)/32); +} + +// HandleByVal - Allocate a stack slot large enough to pass an argument by +// value. The size and alignment information of the argument is encoded in its +// parameter attribute. +void Hexagon_CCState::HandleByVal(unsigned ValNo, EVT ValVT, + EVT LocVT, CCValAssign::LocInfo LocInfo, + int MinSize, int MinAlign, + ISD::ArgFlagsTy ArgFlags) { + unsigned Align = ArgFlags.getByValAlign(); + unsigned Size = ArgFlags.getByValSize(); + if (MinSize > (int)Size) + Size = MinSize; + if (MinAlign > (int)Align) + Align = MinAlign; + unsigned Offset = AllocateStack(Size, Align); + + addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset, + LocVT.getSimpleVT(), LocInfo)); +} + +/// MarkAllocated - Mark a register and all of its aliases as allocated. +void Hexagon_CCState::MarkAllocated(unsigned Reg) { + UsedRegs[Reg/32] |= 1 << (Reg&31); + + if (const unsigned *RegAliases = TRI.getAliasSet(Reg)) + for (; (Reg = *RegAliases); ++RegAliases) + UsedRegs[Reg/32] |= 1 << (Reg&31); +} + +/// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node, +/// incorporating info about the formals into this state. +void +Hexagon_CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> + &Ins, + Hexagon_CCAssignFn Fn, + unsigned SretValueInRegs) { + unsigned NumArgs = Ins.size(); + unsigned i = 0; + + // If the function returns a small struct in registers, skip + // over the first (dummy) argument. + if (SretValueInRegs != 0) { + ++i; + } + + + for (; i != NumArgs; ++i) { + EVT ArgVT = Ins[i].VT; + ISD::ArgFlagsTy ArgFlags = Ins[i].Flags; + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this, 0, 0, false)) { + dbgs() << "Formal argument #" << i << " has unhandled type " + << ArgVT.getEVTString() << "\n"; + abort(); + } + } +} + +/// AnalyzeReturn - Analyze the returned values of an ISD::RET node, +/// incorporating info about the result values into this state. +void +Hexagon_CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, + Hexagon_CCAssignFn Fn, + unsigned SretValueInRegs) { + + // For Hexagon, Return small structures in registers. + if (SretValueInRegs != 0) { + if (SretValueInRegs <= 32) { + unsigned Reg = Hexagon::R0; + addLoc(CCValAssign::getReg(0, MVT::i32, Reg, MVT::i32, + CCValAssign::Full)); + return; + } + if (SretValueInRegs <= 64) { + unsigned Reg = Hexagon::D0; + addLoc(CCValAssign::getReg(0, MVT::i64, Reg, MVT::i64, + CCValAssign::Full)); + return; + } + } + + + // Determine which register each value should be copied into. 
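+ // Each remaining value is handed to the assign function Fn; any value Fn
+ // cannot place is reported as having an unhandled type and lowering aborts.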
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + EVT VT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this, -1, -1, false)){ + dbgs() << "Return operand #" << i << " has unhandled type " + << VT.getEVTString() << "\n"; + abort(); + } + } +} + + +/// AnalyzeCallOperands - Analyze an ISD::CALL node, incorporating info +/// about the passed values into this state. +void +Hexagon_CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> + &Outs, + Hexagon_CCAssignFn Fn, + int NonVarArgsParams, + unsigned SretValueSize) { + unsigned NumOps = Outs.size(); + + unsigned i = 0; + // If the called function returns a small struct in registers, skip + // the first actual parameter. We do not want to pass a pointer to + // the stack location. + if (SretValueSize != 0) { + ++i; + } + + for (; i != NumOps; ++i) { + EVT ArgVT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this, + NonVarArgsParams, i+1, false)) { + dbgs() << "Call operand #" << i << " has unhandled type " + << ArgVT.getEVTString() << "\n"; + abort(); + } + } +} + +/// AnalyzeCallOperands - Same as above except it takes vectors of types +/// and argument flags. +void +Hexagon_CCState::AnalyzeCallOperands(SmallVectorImpl<EVT> &ArgVTs, + SmallVectorImpl<ISD::ArgFlagsTy> &Flags, + Hexagon_CCAssignFn Fn) { + unsigned NumOps = ArgVTs.size(); + for (unsigned i = 0; i != NumOps; ++i) { + EVT ArgVT = ArgVTs[i]; + ISD::ArgFlagsTy ArgFlags = Flags[i]; + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this, -1, -1, + false)) { + dbgs() << "Call operand #" << i << " has unhandled type " + << ArgVT.getEVTString() << "\n"; + abort(); + } + } +} + +/// AnalyzeCallResult - Analyze the return values of an ISD::CALL node, +/// incorporating info about the passed values into this state. +void +Hexagon_CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, + Hexagon_CCAssignFn Fn, + unsigned SretValueInRegs) { + + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + EVT VT = Ins[i].VT; + ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); + if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this, -1, -1, false)) { + dbgs() << "Call result #" << i << " has unhandled type " + << VT.getEVTString() << "\n"; + abort(); + } + } +} + +/// AnalyzeCallResult - Same as above except it's specialized for calls which +/// produce a single value. +void Hexagon_CCState::AnalyzeCallResult(EVT VT, Hexagon_CCAssignFn Fn) { + if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this, -1, -1, + false)) { + dbgs() << "Call result has unhandled type " + << VT.getEVTString() << "\n"; + abort(); + } +} diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.h b/lib/Target/Hexagon/HexagonCallingConvLower.h new file mode 100644 index 0000000..1f601e8 --- /dev/null +++ b/lib/Target/Hexagon/HexagonCallingConvLower.h @@ -0,0 +1,189 @@ +//===-- HexagonCallingConvLower.h - Calling Conventions ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the Hexagon_CCState class, used for lowering +// and implementing calling conventions. 
Adapted from the target independent +// version but this handles calls to varargs functions +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_Hexagon_CODEGEN_CALLINGCONVLOWER_H +#define LLVM_Hexagon_CODEGEN_CALLINGCONVLOWER_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/CallingConvLower.h" + +// +// Need to handle varargs. +// +namespace llvm { + class TargetRegisterInfo; + class TargetMachine; + class Hexagon_CCState; + class SDNode; + + +/// Hexagon_CCAssignFn - This function assigns a location for Val, updating +/// State to reflect the change. +typedef bool Hexagon_CCAssignFn(unsigned ValNo, EVT ValVT, + EVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, Hexagon_CCState &State, + int NonVarArgsParams, + int CurrentParam, + bool ForceMem); + + +/// CCState - This class holds information needed while lowering arguments and +/// return values. It captures which registers are already assigned and which +/// stack slots are used. It provides accessors to allocate these values. +class Hexagon_CCState { + CallingConv::ID CallingConv; + bool IsVarArg; + const TargetMachine &TM; + const TargetRegisterInfo &TRI; + SmallVector<CCValAssign, 16> &Locs; + LLVMContext &Context; + + unsigned StackOffset; + SmallVector<uint32_t, 16> UsedRegs; +public: + Hexagon_CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &TM, + SmallVector<CCValAssign, 16> &locs, LLVMContext &c); + + void addLoc(const CCValAssign &V) { + Locs.push_back(V); + } + + LLVMContext &getContext() const { return Context; } + const TargetMachine &getTarget() const { return TM; } + unsigned getCallingConv() const { return CallingConv; } + bool isVarArg() const { return IsVarArg; } + + unsigned getNextStackOffset() const { return StackOffset; } + + /// isAllocated - Return true if the specified register (or an alias) is + /// allocated. + bool isAllocated(unsigned Reg) const { + return UsedRegs[Reg/32] & (1 << (Reg&31)); + } + + /// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node, + /// incorporating info about the formals into this state. + void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, + Hexagon_CCAssignFn Fn, unsigned SretValueInRegs); + + /// AnalyzeReturn - Analyze the returned values of an ISD::RET node, + /// incorporating info about the result values into this state. + void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, + Hexagon_CCAssignFn Fn, unsigned SretValueInRegs); + + /// AnalyzeCallOperands - Analyze an ISD::CALL node, incorporating info + /// about the passed values into this state. + void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, + Hexagon_CCAssignFn Fn, int NonVarArgsParams, + unsigned SretValueSize); + + /// AnalyzeCallOperands - Same as above except it takes vectors of types + /// and argument flags. + void AnalyzeCallOperands(SmallVectorImpl<EVT> &ArgVTs, + SmallVectorImpl<ISD::ArgFlagsTy> &Flags, + Hexagon_CCAssignFn Fn); + + /// AnalyzeCallResult - Analyze the return values of an ISD::CALL node, + /// incorporating info about the passed values into this state. + void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, + Hexagon_CCAssignFn Fn, unsigned SretValueInRegs); + + /// AnalyzeCallResult - Same as above except it's specialized for calls which + /// produce a single value. 
+ void AnalyzeCallResult(EVT VT, Hexagon_CCAssignFn Fn); + + /// getFirstUnallocated - Return the first unallocated register in the set, or + /// NumRegs if they are all allocated. + unsigned getFirstUnallocated(const unsigned *Regs, unsigned NumRegs) const { + for (unsigned i = 0; i != NumRegs; ++i) + if (!isAllocated(Regs[i])) + return i; + return NumRegs; + } + + /// AllocateReg - Attempt to allocate one register. If it is not available, + /// return zero. Otherwise, return the register, marking it and any aliases + /// as allocated. + unsigned AllocateReg(unsigned Reg) { + if (isAllocated(Reg)) return 0; + MarkAllocated(Reg); + return Reg; + } + + /// Version of AllocateReg with extra register to be shadowed. + unsigned AllocateReg(unsigned Reg, unsigned ShadowReg) { + if (isAllocated(Reg)) return 0; + MarkAllocated(Reg); + MarkAllocated(ShadowReg); + return Reg; + } + + /// AllocateReg - Attempt to allocate one of the specified registers. If none + /// are available, return zero. Otherwise, return the first one available, + /// marking it and any aliases as allocated. + unsigned AllocateReg(const unsigned *Regs, unsigned NumRegs) { + unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs); + if (FirstUnalloc == NumRegs) + return 0; // Didn't find the reg. + + // Mark the register and any aliases as allocated. + unsigned Reg = Regs[FirstUnalloc]; + MarkAllocated(Reg); + return Reg; + } + + /// Version of AllocateReg with list of registers to be shadowed. + unsigned AllocateReg(const unsigned *Regs, const unsigned *ShadowRegs, + unsigned NumRegs) { + unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs); + if (FirstUnalloc == NumRegs) + return 0; // Didn't find the reg. + + // Mark the register and any aliases as allocated. + unsigned Reg = Regs[FirstUnalloc], ShadowReg = ShadowRegs[FirstUnalloc]; + MarkAllocated(Reg); + MarkAllocated(ShadowReg); + return Reg; + } + + /// AllocateStack - Allocate a chunk of stack space with the specified size + /// and alignment. + unsigned AllocateStack(unsigned Size, unsigned Align) { + assert(Align && ((Align-1) & Align) == 0); // Align is power of 2. + StackOffset = ((StackOffset + Align-1) & ~(Align-1)); + unsigned Result = StackOffset; + StackOffset += Size; + return Result; + } + + // HandleByVal - Allocate a stack slot large enough to pass an argument by + // value. The size and alignment information of the argument is encoded in its + // parameter attribute. + void HandleByVal(unsigned ValNo, EVT ValVT, + EVT LocVT, CCValAssign::LocInfo LocInfo, + int MinSize, int MinAlign, ISD::ArgFlagsTy ArgFlags); + +private: + /// MarkAllocated - Mark a register and all of its aliases as allocated. + void MarkAllocated(unsigned Reg); +}; + + + +} // end namespace llvm + +#endif diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp new file mode 100644 index 0000000..cb73ae0 --- /dev/null +++ b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp @@ -0,0 +1,184 @@ +//===--- HexagonExpandPredSpillCode.cpp - Expand Predicate Spill Code ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===//// +// The Hexagon processor has no instructions that load or store predicate +// registers directly. 
So, when these registers must be spilled a general +// purpose register must be found and the value copied to/from it from/to +// the predicate register. This code currently does not use the register +// scavenger mechanism available in the allocator. There are two registers +// reserved to allow spilling/restoring predicate registers. One is used to +// hold the predicate value. The other is used when stack frame offsets are +// too large. +// +//===----------------------------------------------------------------------===// + + +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/LatencyPriorityQueue.h" +#include "llvm/CodeGen/SchedulerRegistry.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "HexagonTargetMachine.h" +#include "HexagonSubtarget.h" +#include "HexagonMachineFunctionInfo.h" +#include <map> +#include <iostream> + +#include "llvm/Support/CommandLine.h" + + +using namespace llvm; + + +namespace { + +class HexagonExpandPredSpillCode : public MachineFunctionPass { + HexagonTargetMachine& QTM; + const HexagonSubtarget &QST; + + public: + static char ID; + HexagonExpandPredSpillCode(HexagonTargetMachine& TM) : + MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {} + + const char *getPassName() const { + return "Hexagon Expand Predicate Spill Code"; + } + bool runOnMachineFunction(MachineFunction &Fn); +}; + + +char HexagonExpandPredSpillCode::ID = 0; + + +bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { + + const HexagonInstrInfo *TII = QTM.getInstrInfo(); + const HexagonRegisterInfo *RegInfo = QTM.getRegisterInfo(); + + // Loop over all of the basic blocks. + for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); + MBBb != MBBe; ++MBBb) { + MachineBasicBlock* MBB = MBBb; + // Traverse the basic block. 
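+ // Look for the STriw_pred / LDriw_pred pseudo-instructions and expand each
+ // into a predicate<->GPR transfer plus an ordinary word store or load
+ // through the reserved scratch registers.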
+ for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); + ++MII) { + MachineInstr *MI = MII; + int Opc = MI->getOpcode(); + if (Opc == Hexagon::STriw_pred) { + // STriw_pred [R30], ofst, SrcReg; + unsigned FP = MI->getOperand(0).getReg(); + assert(FP == RegInfo->getFrameRegister() && + "Not a Frame Pointer, Nor a Spill Slot"); + assert(MI->getOperand(1).isImm() && "Not an offset"); + int Offset = MI->getOperand(1).getImm(); + int SrcReg = MI->getOperand(2).getReg(); + assert(Hexagon::PredRegsRegClass.contains(SrcReg) && + "Not a predicate register"); + if (!TII->isValidOffset(Hexagon::STriw, Offset)) { + if (!TII->isValidOffset(Hexagon::ADD_ri, Offset)) { + BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Hexagon::CONST32_Int_Real), + HEXAGON_RESERVED_REG_1).addImm(Offset); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_rr), + HEXAGON_RESERVED_REG_1) + .addReg(FP).addReg(HEXAGON_RESERVED_REG_1); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd), + HEXAGON_RESERVED_REG_2).addReg(SrcReg); + BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Hexagon::STriw)) + .addReg(HEXAGON_RESERVED_REG_1) + .addImm(0).addReg(HEXAGON_RESERVED_REG_2); + } else { + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_ri), + HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd), + HEXAGON_RESERVED_REG_2).addReg(SrcReg); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::STriw)) + .addReg(HEXAGON_RESERVED_REG_1) + .addImm(0) + .addReg(HEXAGON_RESERVED_REG_2); + } + } else { + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd), + HEXAGON_RESERVED_REG_2).addReg(SrcReg); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::STriw)). + addReg(FP).addImm(Offset).addReg(HEXAGON_RESERVED_REG_2); + } + MII = MBB->erase(MI); + --MII; + } else if (Opc == Hexagon::LDriw_pred) { + // DstReg = LDriw_pred [R30], ofst. 
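+ // Expansion sketch when the offset is in range: reserved reg 2 is loaded
+ // with memw(FP + #Offset) and then transferred into the predicate register
+ // via TFR_PdRs; out-of-range offsets first build the address in reserved
+ // reg 1.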
+ int DstReg = MI->getOperand(0).getReg(); + assert(Hexagon::PredRegsRegClass.contains(DstReg) && + "Not a predicate register"); + unsigned FP = MI->getOperand(1).getReg(); + assert(FP == RegInfo->getFrameRegister() && + "Not a Frame Pointer, Nor a Spill Slot"); + assert(MI->getOperand(2).isImm() && "Not an offset"); + int Offset = MI->getOperand(2).getImm(); + if (!TII->isValidOffset(Hexagon::LDriw, Offset)) { + if (!TII->isValidOffset(Hexagon::ADD_ri, Offset)) { + BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Hexagon::CONST32_Int_Real), + HEXAGON_RESERVED_REG_1).addImm(Offset); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_rr), + HEXAGON_RESERVED_REG_1) + .addReg(FP) + .addReg(HEXAGON_RESERVED_REG_1); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw), + HEXAGON_RESERVED_REG_2) + .addReg(HEXAGON_RESERVED_REG_1) + .addImm(0); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs), + DstReg).addReg(HEXAGON_RESERVED_REG_2); + } else { + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_ri), + HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw), + HEXAGON_RESERVED_REG_2) + .addReg(HEXAGON_RESERVED_REG_1) + .addImm(0); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs), + DstReg).addReg(HEXAGON_RESERVED_REG_2); + } + } else { + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw), + HEXAGON_RESERVED_REG_2).addReg(FP).addImm(Offset); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs), + DstReg).addReg(HEXAGON_RESERVED_REG_2); + } + MII = MBB->erase(MI); + --MII; + } + } + } + + return true; +} + +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +FunctionPass *llvm::createHexagonExpandPredSpillCode(HexagonTargetMachine &TM) { + return new HexagonExpandPredSpillCode(TM); +} diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp new file mode 100644 index 0000000..78e0b1c --- /dev/null +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -0,0 +1,333 @@ +//==-- HexagonFrameLowering.cpp - Define frame lowering --*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
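+//
+// This file implements Hexagon prologue/epilogue insertion, built around the
+// allocframe / deallocframe instructions, together with callee-saved register
+// spills and restores.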
+// +// +//===----------------------------------------------------------------------===// +#include "Hexagon.h" +#include "HexagonInstrInfo.h" +#include "HexagonRegisterInfo.h" +#include "HexagonSubtarget.h" +#include "HexagonTargetMachine.h" +#include "HexagonMachineFunctionInfo.h" +#include "HexagonFrameLowering.h" + +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Type.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include <iostream> + +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Function.h" +using namespace llvm; + +static cl::opt<bool> DisableDeallocRet( + "disable-hexagon-dealloc-ret", + cl::Hidden, + cl::desc("Disable Dealloc Return for Hexagon target")); + +/// determineFrameLayout - Determine the size of the frame and maximum call +/// frame size. +void HexagonFrameLowering::determineFrameLayout(MachineFunction &MF) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Get the number of bytes to allocate from the FrameInfo. + unsigned FrameSize = MFI->getStackSize(); + + // Get the alignments provided by the target. + unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment(); + // Get the maximum call frame size of all the calls. + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + + // If we have dynamic alloca then maxCallFrameSize needs to be aligned so + // that allocations will be aligned. + if (MFI->hasVarSizedObjects()) + maxCallFrameSize = RoundUpToAlignment(maxCallFrameSize, TargetAlign); + + // Update maximum call frame size. + MFI->setMaxCallFrameSize(maxCallFrameSize); + + // Include call frame size in total. + FrameSize += maxCallFrameSize; + + // Make sure the frame is aligned. + FrameSize = RoundUpToAlignment(FrameSize, TargetAlign); + + // Update frame info. + MFI->setStackSize(FrameSize); +} + + +void HexagonFrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + const HexagonRegisterInfo *QRI = + static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo()); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + determineFrameLayout(MF); + + // Check if frame moves are needed for EH. + bool needsFrameMoves = MMI.hasDebugInfo() || + !MF.getFunction()->needsUnwindTableEntry(); + + // Get the number of bytes to allocate from the FrameInfo. + int NumBytes = (int) MFI->getStackSize(); + + // LLVM expects allocframe not to be the first instruction in the + // basic block. + MachineBasicBlock::iterator InsertPt = MBB.begin(); + + // + // ALLOCA adjust regs. Iterate over ADJDYNALLOC nodes and change the offset. 
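+ // Each ADJDYNALLOC pseudo collected in the function info carries an
+ // immediate operand; it is rewritten below to the maximum call frame size,
+ // which is only known once frame layout is complete.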
+ // + HexagonMachineFunctionInfo *FuncInfo = + MF.getInfo<HexagonMachineFunctionInfo>(); + const std::vector<MachineInstr*>& AdjustRegs = + FuncInfo->getAllocaAdjustInsts(); + for (std::vector<MachineInstr*>::const_iterator i = AdjustRegs.begin(), + e = AdjustRegs.end(); + i != e; ++i) { + MachineInstr* MI = *i; + assert((MI->getOpcode() == Hexagon::ADJDYNALLOC) && + "Expected adjust alloca node"); + + MachineOperand& MO = MI->getOperand(2); + assert(MO.isImm() && "Expected immediate"); + MO.setImm(MFI->getMaxCallFrameSize()); + } + + std::vector<MachineMove> &Moves = MMI.getFrameMoves(); + + if (needsFrameMoves) { + // Advance CFA. DW_CFA_def_cfa + unsigned FPReg = QRI->getFrameRegister(); + unsigned RAReg = QRI->getRARegister(); + + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(FPReg, -8); + Moves.push_back(MachineMove(0, Dst, Src)); + + // R31 = (R31 - #4) + MachineLocation LRDst(RAReg, -4); + MachineLocation LRSrc(RAReg); + Moves.push_back(MachineMove(0, LRDst, LRSrc)); + + // R30 = (R30 - #8) + MachineLocation SPDst(FPReg, -8); + MachineLocation SPSrc(FPReg); + Moves.push_back(MachineMove(0, SPDst, SPSrc)); + } + + // + // Only insert ALLOCFRAME if we need to. + // + if (hasFP(MF)) { + // Check for overflow. + // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used? + const int ALLOCFRAME_MAX = 16384; + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + if (NumBytes >= ALLOCFRAME_MAX) { + // Emit allocframe(#0). + BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::ALLOCFRAME)).addImm(0); + + // Subtract offset from frame pointer. + BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::CONST32_Int_Real), + HEXAGON_RESERVED_REG_1).addImm(NumBytes); + BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::SUB_rr), + QRI->getStackRegister()). + addReg(QRI->getStackRegister()). + addReg(HEXAGON_RESERVED_REG_1); + } else { + BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::ALLOCFRAME)).addImm(NumBytes); + } + } +} +// Returns true if MBB has a machine instructions that indicates a tail call +// in the block. +bool HexagonFrameLowering::hasTailCall(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + unsigned RetOpcode = MBBI->getOpcode(); + + return RetOpcode == Hexagon::TCRETURNtg || RetOpcode == Hexagon::TCRETURNtext;} + +void HexagonFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + DebugLoc dl = MBBI->getDebugLoc(); + // + // Only insert deallocframe if we need to. + // + if (hasFP(MF)) { + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + MachineBasicBlock::iterator MBBI_end = MBB.end(); + // + // For Hexagon, we don't need the frame size. + // + MachineFrameInfo *MFI = MF.getFrameInfo(); + int NumBytes = (int) MFI->getStackSize(); + + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + // Replace 'jumpr r31' instruction with dealloc_return for V4 and higher + // versions. + if (STI.hasV4TOps() && MBBI->getOpcode() == Hexagon::JMPR + && !DisableDeallocRet) { + // Remove jumpr node. + MBB.erase(MBBI); + // Add dealloc_return. + BuildMI(MBB, MBBI_end, dl, TII.get(Hexagon::DEALLOC_RET_V4)) + .addImm(NumBytes); + } else { // Add deallocframe for V2 and V3. 
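+ // deallocframe is inserted ahead of the existing return instruction and
+ // tears down the frame created by allocframe (restoring LR and FP).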
+ BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME)).addImm(NumBytes); + } + } +} + +bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const HexagonMachineFunctionInfo *FuncInfo = + MF.getInfo<HexagonMachineFunctionInfo>(); + return (MFI->hasCalls() || (MFI->getStackSize() > 0) || + FuncInfo->hasClobberLR() ); +} + +bool +HexagonFrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo(); + + if (CSI.empty()) { + return false; + } + + // We can only schedule double loads if we spill contiguous callee-saved regs + // For instance, we cannot scheduled double-word loads if we spill r24, + // r26, and r27. + // Hexagon_TODO: We can try to double-word align odd registers for -O2 and + // above. + bool ContiguousRegs = true; + + for (unsigned i = 0; i < CSI.size(); ++i) { + unsigned Reg = CSI[i].getReg(); + + // + // Check if we can use a double-word store. + // + const unsigned* SuperReg = TRI->getSuperRegisters(Reg); + + // Assume that there is exactly one superreg. + assert(SuperReg[0] && !SuperReg[1] && "Expected exactly one superreg"); + bool CanUseDblStore = false; + const TargetRegisterClass* SuperRegClass = 0; + + if (ContiguousRegs && (i < CSI.size()-1)) { + const unsigned* SuperRegNext = TRI->getSuperRegisters(CSI[i+1].getReg()); + assert(SuperRegNext[0] && !SuperRegNext[1] && + "Expected exactly one superreg"); + SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg[0]); + CanUseDblStore = (SuperRegNext[0] == SuperReg[0]); + } + + + if (CanUseDblStore) { + TII.storeRegToStackSlot(MBB, MI, SuperReg[0], true, + CSI[i+1].getFrameIdx(), SuperRegClass, TRI); + MBB.addLiveIn(SuperReg[0]); + ++i; + } else { + // Cannot use a double-word store. + ContiguousRegs = false; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i].getFrameIdx(), RC, + TRI); + MBB.addLiveIn(Reg); + } + } + return true; +} + + +bool HexagonFrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + + MachineFunction *MF = MBB.getParent(); + const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo(); + + if (CSI.empty()) { + return false; + } + + // We can only schedule double loads if we spill contiguous callee-saved regs + // For instance, we cannot scheduled double-word loads if we spill r24, + // r26, and r27. + // Hexagon_TODO: We can try to double-word align odd registers for -O2 and + // above. + bool ContiguousRegs = true; + + for (unsigned i = 0; i < CSI.size(); ++i) { + unsigned Reg = CSI[i].getReg(); + + // + // Check if we can use a double-word load. + // + const unsigned* SuperReg = TRI->getSuperRegisters(Reg); + const TargetRegisterClass* SuperRegClass = 0; + + // Assume that there is exactly one superreg. 
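+ // (On Hexagon each 32-bit Rn is half of exactly one 64-bit pair Dm, so a
+ // single super-register is expected here.)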
+ assert(SuperReg[0] && !SuperReg[1] && "Expected exactly one superreg"); + bool CanUseDblLoad = false; + if (ContiguousRegs && (i < CSI.size()-1)) { + const unsigned* SuperRegNext = TRI->getSuperRegisters(CSI[i+1].getReg()); + assert(SuperRegNext[0] && !SuperRegNext[1] && + "Expected exactly one superreg"); + SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg[0]); + CanUseDblLoad = (SuperRegNext[0] == SuperReg[0]); + } + + + if (CanUseDblLoad) { + TII.loadRegFromStackSlot(MBB, MI, SuperReg[0], CSI[i+1].getFrameIdx(), + SuperRegClass, TRI); + MBB.addLiveIn(SuperReg[0]); + ++i; + } else { + // Cannot use a double-word load. + ContiguousRegs = false; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); + MBB.addLiveIn(Reg); + } + } + return true; +} + +int HexagonFrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + return MF.getFrameInfo()->getObjectOffset(FI); +} diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h new file mode 100644 index 0000000..ad87f11 --- /dev/null +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -0,0 +1,50 @@ +//=- HexagonFrameLowering.h - Define frame lowering for Hexagon --*- C++ -*--=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGON_FRAMEINFO_H +#define HEXAGON_FRAMEINFO_H + +#include "Hexagon.h" +#include "HexagonSubtarget.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +class HexagonFrameLowering : public TargetFrameLowering { +private: + const HexagonSubtarget &STI; + void determineFrameLayout(MachineFunction &MF) const; + +public: + explicit HexagonFrameLowering(const HexagonSubtarget &sti) + : TargetFrameLowering(StackGrowsDown, 8, 0), STI(sti) { + } + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + virtual bool + spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + virtual bool + restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + bool hasFP(const MachineFunction &MF) const; + bool hasTailCall(MachineBasicBlock &MBB) const; +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp new file mode 100644 index 0000000..c1abc4a --- /dev/null +++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -0,0 +1,644 @@ +//===-- HexagonHardwareLoops.cpp - Identify and generate hardware loops ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass identifies loops where we can generate the Hexagon hardware +// loop instruction. The hardware loop can perform loop branches with a +// zero-cycle overhead. 
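+// (The loop setup is emitted as LOOP0_i / LOOP0_r and the loop end is marked
+// with ENDLOOP0; see isHardwareLoop() below.)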
+// +// The pattern that defines the induction variable can changed depending on +// prior optimizations. For example, the IndVarSimplify phase run by 'opt' +// normalizes induction variables, and the Loop Strength Reduction pass +// run by 'llc' may also make changes to the induction variable. +// The pattern detected by this phase is due to running Strength Reduction. +// +// Criteria for hardware loops: +// - Countable loops (w/ ind. var for a trip count) +// - Assumes loops are normalized by IndVarSimplify +// - Try inner-most loops first +// - No nested hardware loops. +// - No function calls in loops. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hwloops" +#include "llvm/Constants.h" +#include "llvm/PassSupport.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include <algorithm> +#include "Hexagon.h" +#include "HexagonTargetMachine.h" + +using namespace llvm; + +STATISTIC(NumHWLoops, "Number of loops converted to hardware loops"); + +namespace { + class CountValue; + struct HexagonHardwareLoops : public MachineFunctionPass { + MachineLoopInfo *MLI; + MachineRegisterInfo *MRI; + const TargetInstrInfo *TII; + + public: + static char ID; // Pass identification, replacement for typeid + + HexagonHardwareLoops() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { return "Hexagon Hardware Loops"; } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + private: + /// getCanonicalInductionVariable - Check to see if the loop has a canonical + /// induction variable. + /// Should be defined in MachineLoop. Based upon version in class Loop. + const MachineInstr *getCanonicalInductionVariable(MachineLoop *L) const; + + /// getTripCount - Return a loop-invariant LLVM register indicating the + /// number of times the loop will be executed. If the trip-count cannot + /// be determined, this return null. + CountValue *getTripCount(MachineLoop *L) const; + + /// isInductionOperation - Return true if the instruction matches the + /// pattern for an opertion that defines an induction variable. + bool isInductionOperation(const MachineInstr *MI, unsigned IVReg) const; + + /// isInvalidOperation - Return true if the instruction is not valid within + /// a hardware loop. + bool isInvalidLoopOperation(const MachineInstr *MI) const; + + /// containsInavlidInstruction - Return true if the loop contains an + /// instruction that inhibits using the hardware loop. + bool containsInvalidInstruction(MachineLoop *L) const; + + /// converToHardwareLoop - Given a loop, check if we can convert it to a + /// hardware loop. If so, then perform the conversion and return true. 
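+    /// Nested loops are visited first; if an inner loop was converted, the
+    /// enclosing loop is left alone, since hardware loops cannot be nested.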
+ bool convertToHardwareLoop(MachineLoop *L); + + }; + + char HexagonHardwareLoops::ID = 0; + + + // CountValue class - Abstraction for a trip count of a loop. A + // smaller vesrsion of the MachineOperand class without the concerns + // of changing the operand representation. + class CountValue { + public: + enum CountValueType { + CV_Register, + CV_Immediate + }; + private: + CountValueType Kind; + union Values { + unsigned RegNum; + int64_t ImmVal; + Values(unsigned r) : RegNum(r) {} + Values(int64_t i) : ImmVal(i) {} + } Contents; + bool isNegative; + + public: + CountValue(unsigned r, bool neg) : Kind(CV_Register), Contents(r), + isNegative(neg) {} + explicit CountValue(int64_t i) : Kind(CV_Immediate), Contents(i), + isNegative(i < 0) {} + CountValueType getType() const { return Kind; } + bool isReg() const { return Kind == CV_Register; } + bool isImm() const { return Kind == CV_Immediate; } + bool isNeg() const { return isNegative; } + + unsigned getReg() const { + assert(isReg() && "Wrong CountValue accessor"); + return Contents.RegNum; + } + void setReg(unsigned Val) { + Contents.RegNum = Val; + } + int64_t getImm() const { + assert(isImm() && "Wrong CountValue accessor"); + if (isNegative) { + return -Contents.ImmVal; + } + return Contents.ImmVal; + } + void setImm(int64_t Val) { + Contents.ImmVal = Val; + } + + void print(raw_ostream &OS, const TargetMachine *TM = 0) const { + if (isReg()) { OS << PrintReg(getReg()); } + if (isImm()) { OS << getImm(); } + } + }; + + struct HexagonFixupHwLoops : public MachineFunctionPass { + public: + static char ID; // Pass identification, replacement for typeid. + + HexagonFixupHwLoops() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { return "Hexagon Hardware Loop Fixup"; } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + private: + /// Maximum distance between the loop instr and the basic block. + /// Just an estimate. + static const unsigned MAX_LOOP_DISTANCE = 200; + + /// fixupLoopInstrs - Check the offset between each loop instruction and + /// the loop basic block to determine if we can use the LOOP instruction + /// or if we need to set the LC/SA registers explicitly. + bool fixupLoopInstrs(MachineFunction &MF); + + /// convertLoopInstr - Add the instruction to set the LC and SA registers + /// explicitly. + void convertLoopInstr(MachineFunction &MF, + MachineBasicBlock::iterator &MII, + RegScavenger &RS); + + }; + + char HexagonFixupHwLoops::ID = 0; + +} // end anonymous namespace + + +/// isHardwareLoop - Returns true if the instruction is a hardware loop +/// instruction. +static bool isHardwareLoop(const MachineInstr *MI) { + return MI->getOpcode() == Hexagon::LOOP0_r || + MI->getOpcode() == Hexagon::LOOP0_i; +} + +/// isCompareEquals - Returns true if the instruction is a compare equals +/// instruction with an immediate operand. +static bool isCompareEqualsImm(const MachineInstr *MI) { + return MI->getOpcode() == Hexagon::CMPEQri; +} + + +/// createHexagonHardwareLoops - Factory for creating +/// the hardware loop phase. 
+FunctionPass *llvm::createHexagonHardwareLoops() { + return new HexagonHardwareLoops(); +} + + +bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********* Hexagon Hardware Loops *********\n"); + + bool Changed = false; + + // get the loop information + MLI = &getAnalysis<MachineLoopInfo>(); + // get the register information + MRI = &MF.getRegInfo(); + // the target specific instructio info. + TII = MF.getTarget().getInstrInfo(); + + for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); + I != E; ++I) { + MachineLoop *L = *I; + if (!L->getParentLoop()) { + Changed |= convertToHardwareLoop(L); + } + } + + return Changed; +} + +/// getCanonicalInductionVariable - Check to see if the loop has a canonical +/// induction variable. We check for a simple recurrence pattern - an +/// integer recurrence that decrements by one each time through the loop and +/// ends at zero. If so, return the phi node that corresponds to it. +/// +/// Based upon the similar code in LoopInfo except this code is specific to +/// the machine. +/// This method assumes that the IndVarSimplify pass has been run by 'opt'. +/// +const MachineInstr +*HexagonHardwareLoops::getCanonicalInductionVariable(MachineLoop *L) const { + MachineBasicBlock *TopMBB = L->getTopBlock(); + MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin(); + assert(PI != TopMBB->pred_end() && + "Loop must have more than one incoming edge!"); + MachineBasicBlock *Backedge = *PI++; + if (PI == TopMBB->pred_end()) return 0; // dead loop + MachineBasicBlock *Incoming = *PI++; + if (PI != TopMBB->pred_end()) return 0; // multiple backedges? + + // make sure there is one incoming and one backedge and determine which + // is which. + if (L->contains(Incoming)) { + if (L->contains(Backedge)) + return 0; + std::swap(Incoming, Backedge); + } else if (!L->contains(Backedge)) + return 0; + + // Loop over all of the PHI nodes, looking for a canonical induction variable: + // - The PHI node is "reg1 = PHI reg2, BB1, reg3, BB2". + // - The recurrence comes from the backedge. + // - the definition is an induction operatio.n + for (MachineBasicBlock::iterator I = TopMBB->begin(), E = TopMBB->end(); + I != E && I->isPHI(); ++I) { + const MachineInstr *MPhi = &*I; + unsigned DefReg = MPhi->getOperand(0).getReg(); + for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) { + // Check each operand for the value from the backedge. + MachineBasicBlock *MBB = MPhi->getOperand(i+1).getMBB(); + if (L->contains(MBB)) { // operands comes from the backedge + // Check if the definition is an induction operation. + const MachineInstr *DI = MRI->getVRegDef(MPhi->getOperand(i).getReg()); + if (isInductionOperation(DI, DefReg)) { + return MPhi; + } + } + } + } + return 0; +} + +/// getTripCount - Return a loop-invariant LLVM value indicating the +/// number of times the loop will be executed. The trip count can +/// be either a register or a constant value. If the trip-count +/// cannot be determined, this returns null. +/// +/// We find the trip count from the phi instruction that defines the +/// induction variable. We follow the links to the CMP instruction +/// to get the trip count. +/// +/// Based upon getTripCount in LoopInfo. +/// +CountValue *HexagonHardwareLoops::getTripCount(MachineLoop *L) const { + // Check that the loop has a induction variable. 
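+  // Without a canonical induction variable there is no way to recover a
+  // loop-invariant trip count, so give up immediately.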
+ const MachineInstr *IV_Inst = getCanonicalInductionVariable(L); + if (IV_Inst == 0) return 0; + + // Canonical loops will end with a 'cmpeq_ri IV, Imm', + // if Imm is 0, get the count from the PHI opnd + // if Imm is -M, than M is the count + // Otherwise, Imm is the count + const MachineOperand *IV_Opnd; + const MachineOperand *InitialValue; + if (!L->contains(IV_Inst->getOperand(2).getMBB())) { + InitialValue = &IV_Inst->getOperand(1); + IV_Opnd = &IV_Inst->getOperand(3); + } else { + InitialValue = &IV_Inst->getOperand(3); + IV_Opnd = &IV_Inst->getOperand(1); + } + + // Look for the cmp instruction to determine if we + // can get a useful trip count. The trip count can + // be either a register or an immediate. The location + // of the value depends upon the type (reg or imm). + while ((IV_Opnd = IV_Opnd->getNextOperandForReg())) { + const MachineInstr *MI = IV_Opnd->getParent(); + if (L->contains(MI) && isCompareEqualsImm(MI)) { + const MachineOperand &MO = MI->getOperand(2); + assert(MO.isImm() && "IV Cmp Operand should be 0"); + int64_t ImmVal = MO.getImm(); + + const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg()); + assert(L->contains(IV_DefInstr->getParent()) && + "IV definition should occurs in loop"); + int64_t iv_value = IV_DefInstr->getOperand(2).getImm(); + + if (ImmVal == 0) { + // Make sure the induction variable changes by one on each iteration. + if (iv_value != 1 && iv_value != -1) { + return 0; + } + return new CountValue(InitialValue->getReg(), iv_value > 0); + } else { + assert(InitialValue->isReg() && "Expecting register for init value"); + const MachineInstr *DefInstr = MRI->getVRegDef(InitialValue->getReg()); + if (DefInstr && DefInstr->getOpcode() == Hexagon::TFRI) { + int64_t count = ImmVal - DefInstr->getOperand(1).getImm(); + if ((count % iv_value) != 0) { + return 0; + } + return new CountValue(count/iv_value); + } + } + } + } + return 0; +} + +/// isInductionOperation - return true if the operation is matches the +/// pattern that defines an induction variable: +/// add iv, c +/// +bool +HexagonHardwareLoops::isInductionOperation(const MachineInstr *MI, + unsigned IVReg) const { + return (MI->getOpcode() == + Hexagon::ADD_ri && MI->getOperand(1).getReg() == IVReg); +} + +/// isInvalidOperation - Return true if the operation is invalid within +/// hardware loop. +bool +HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI) const { + + // call is not allowed because the callee may use a hardware loop + if (MI->getDesc().isCall()) { + return true; + } + // do not allow nested hardware loops + if (isHardwareLoop(MI)) { + return true; + } + // check if the instruction defines a hardware loop register + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDef() && + (MO.getReg() == Hexagon::LC0 || MO.getReg() == Hexagon::LC1 || + MO.getReg() == Hexagon::SA0 || MO.getReg() == Hexagon::SA0)) { + return true; + } + } + return false; +} + +/// containsInvalidInstruction - Return true if the loop contains +/// an instruction that inhibits the use of the hardware loop function. 
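+/// Calls, already-generated hardware loops, and any definition of the loop
+/// registers (LC0/LC1/SA0) disqualify the loop.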
+/// +bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L) const { + const std::vector<MachineBasicBlock*> Blocks = L->getBlocks(); + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + MachineBasicBlock *MBB = Blocks[i]; + for (MachineBasicBlock::iterator + MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) { + const MachineInstr *MI = &*MII; + if (isInvalidLoopOperation(MI)) { + return true; + } + } + } + return false; +} + +/// converToHardwareLoop - check if the loop is a candidate for +/// converting to a hardware loop. If so, then perform the +/// transformation. +/// +/// This function works on innermost loops first. A loop can +/// be converted if it is a counting loop; either a register +/// value or an immediate. +/// +/// The code makes several assumptions about the representation +/// of the loop in llvm. +bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { + bool Changed = false; + // Process nested loops first. + for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) { + Changed |= convertToHardwareLoop(*I); + } + // If a nested loop has been converted, then we can't convert this loop. + if (Changed) { + return Changed; + } + // Are we able to determine the trip count for the loop? + CountValue *TripCount = getTripCount(L); + if (TripCount == 0) { + return false; + } + // Does the loop contain any invalid instructions? + if (containsInvalidInstruction(L)) { + return false; + } + MachineBasicBlock *Preheader = L->getLoopPreheader(); + // No preheader means there's not place for the loop instr. + if (Preheader == 0) { + return false; + } + MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator(); + + MachineBasicBlock *LastMBB = L->getExitingBlock(); + // Don't generate hw loop if the loop has more than one exit. + if (LastMBB == 0) { + return false; + } + MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator(); + + // Determine the loop start. + MachineBasicBlock *LoopStart = L->getTopBlock(); + if (L->getLoopLatch() != LastMBB) { + // When the exit and latch are not the same, use the latch block as the + // start. + // The loop start address is used only after the 1st iteration, and the loop + // latch may contains instrs. that need to be executed after the 1st iter. + LoopStart = L->getLoopLatch(); + // Make sure the latch is a successor of the exit, otherwise it won't work. + if (!LastMBB->isSuccessor(LoopStart)) { + return false; + } + } + + // Convert the loop to a hardware loop + DEBUG(dbgs() << "Change to hardware loop at "; L->dump()); + + if (TripCount->isReg()) { + // Create a copy of the loop count register. + MachineFunction *MF = LastMBB->getParent(); + const TargetRegisterClass *RC = + MF->getRegInfo().getRegClass(TripCount->getReg()); + unsigned CountReg = MF->getRegInfo().createVirtualRegister(RC); + BuildMI(*Preheader, InsertPos, InsertPos->getDebugLoc(), + TII->get(TargetOpcode::COPY), CountReg).addReg(TripCount->getReg()); + if (TripCount->isNeg()) { + unsigned CountReg1 = CountReg; + CountReg = MF->getRegInfo().createVirtualRegister(RC); + BuildMI(*Preheader, InsertPos, InsertPos->getDebugLoc(), + TII->get(Hexagon::NEG), CountReg).addReg(CountReg1); + } + + // Add the Loop instruction to the begining of the loop. 
+ BuildMI(*Preheader, InsertPos, InsertPos->getDebugLoc(), + TII->get(Hexagon::LOOP0_r)).addMBB(LoopStart).addReg(CountReg); + } else { + assert(TripCount->isImm() && "Expecting immedate vaule for trip count"); + // Add the Loop immediate instruction to the beginning of the loop. + int64_t CountImm = TripCount->getImm(); + BuildMI(*Preheader, InsertPos, InsertPos->getDebugLoc(), + TII->get(Hexagon::LOOP0_i)).addMBB(LoopStart).addImm(CountImm); + } + + // Make sure the loop start always has a reference in the CFG. We need to + // create a BlockAddress operand to get this mechanism to work both the + // MachineBasicBlock and BasicBlock objects need the flag set. + LoopStart->setHasAddressTaken(); + // This line is needed to set the hasAddressTaken flag on the BasicBlock + // object + BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock())); + + // Replace the loop branch with an endloop instruction. + DebugLoc dl = LastI->getDebugLoc(); + BuildMI(*LastMBB, LastI, dl, TII->get(Hexagon::ENDLOOP0)).addMBB(LoopStart); + + // The loop ends with either: + // - a conditional branch followed by an unconditional branch, or + // - a conditional branch to the loop start. + if (LastI->getOpcode() == Hexagon::JMP_Pred || + LastI->getOpcode() == Hexagon::JMP_PredNot) { + // delete one and change/add an uncond. branch to out of the loop + MachineBasicBlock *BranchTarget = LastI->getOperand(1).getMBB(); + LastI = LastMBB->erase(LastI); + if (!L->contains(BranchTarget)) { + if (LastI != LastMBB->end()) { + TII->RemoveBranch(*LastMBB); + } + SmallVector<MachineOperand, 0> Cond; + TII->InsertBranch(*LastMBB, BranchTarget, 0, Cond, dl); + } + } else { + // Conditional branch to loop start; just delete it. + LastMBB->erase(LastI); + } + delete TripCount; + + ++NumHWLoops; + return true; +} + +/// createHexagonFixupHwLoops - Factory for creating the hardware loop +/// phase. +FunctionPass *llvm::createHexagonFixupHwLoops() { + return new HexagonFixupHwLoops(); +} + +bool HexagonFixupHwLoops::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "****** Hexagon Hardware Loop Fixup ******\n"); + + bool Changed = fixupLoopInstrs(MF); + return Changed; +} + +/// fixupLoopInsts - For Hexagon, if the loop label is to far from the +/// loop instruction then we need to set the LC0 and SA0 registers +/// explicitly instead of using LOOP(start,count). This function +/// checks the distance, and generates register assignments if needed. +/// +/// This function makes two passes over the basic blocks. The first +/// pass computes the offset of the basic block from the start. +/// The second pass checks all the loop instructions. +bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) { + + // Offset of the current instruction from the start. + unsigned InstOffset = 0; + // Map for each basic block to it's first instruction. + DenseMap<MachineBasicBlock*, unsigned> BlockToInstOffset; + + // First pass - compute the offset of each basic block. + for (MachineFunction::iterator MBB = MF.begin(), MBBe = MF.end(); + MBB != MBBe; ++MBB) { + BlockToInstOffset[MBB] = InstOffset; + InstOffset += (MBB->size() * 4); + } + + // Second pass - check each loop instruction to see if it needs to + // be converted. + InstOffset = 0; + bool Changed = false; + RegScavenger RS; + + // Loop over all the basic blocks. + for (MachineFunction::iterator MBB = MF.begin(), MBBe = MF.end(); + MBB != MBBe; ++MBB) { + InstOffset = BlockToInstOffset[MBB]; + RS.enterBasicBlock(MBB); + + // Loop over all the instructions. 
+ MachineBasicBlock::iterator MIE = MBB->end(); + MachineBasicBlock::iterator MII = MBB->begin(); + while (MII != MIE) { + if (isHardwareLoop(MII)) { + RS.forward(MII); + assert(MII->getOperand(0).isMBB() && + "Expect a basic block as loop operand"); + int diff = InstOffset - BlockToInstOffset[MII->getOperand(0).getMBB()]; + diff = (diff > 0 ? diff : -diff); + if ((unsigned)diff > MAX_LOOP_DISTANCE) { + // Convert to explicity setting LC0 and SA0. + convertLoopInstr(MF, MII, RS); + MII = MBB->erase(MII); + Changed = true; + } else { + ++MII; + } + } else { + ++MII; + } + InstOffset += 4; + } + } + + return Changed; + +} + +/// convertLoopInstr - convert a loop instruction to a sequence of instructions +/// that set the lc and sa register explicitly. +void HexagonFixupHwLoops::convertLoopInstr(MachineFunction &MF, + MachineBasicBlock::iterator &MII, + RegScavenger &RS) { + const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); + MachineBasicBlock *MBB = MII->getParent(); + DebugLoc DL = MII->getDebugLoc(); + unsigned Scratch = RS.scavengeRegister(Hexagon::IntRegsRegisterClass, MII, 0); + + // First, set the LC0 with the trip count. + if (MII->getOperand(1).isReg()) { + // Trip count is a register + BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0) + .addReg(MII->getOperand(1).getReg()); + } else { + // Trip count is an immediate. + BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFRI), Scratch) + .addImm(MII->getOperand(1).getImm()); + BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0) + .addReg(Scratch); + } + // Then, set the SA0 with the loop start address. + BuildMI(*MBB, MII, DL, TII->get(Hexagon::CONST32_Label), Scratch) + .addMBB(MII->getOperand(0).getMBB()); + BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::SA0).addReg(Scratch); +} diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp new file mode 100644 index 0000000..4deab9f --- /dev/null +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -0,0 +1,1495 @@ +//==-- HexagonISelDAGToDAG.cpp - A dag to dag inst selector for Hexagon ----==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the Hexagon target. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hexagon-isel" +#include "HexagonISelLowering.h" +#include "HexagonTargetMachine.h" +#include "llvm/Intrinsics.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +//===--------------------------------------------------------------------===// +/// HexagonDAGToDAGISel - Hexagon specific code to select Hexagon machine +/// instructions for SelectionDAG operations. +/// +namespace { +class HexagonDAGToDAGISel : public SelectionDAGISel { + /// Subtarget - Keep a pointer to the Hexagon Subtarget around so that we can + /// make the right decision when generating code for different targets. + const HexagonSubtarget &Subtarget; + + // Keep a reference to HexagonTargetMachine. 
+ HexagonTargetMachine& TM; + const HexagonInstrInfo *TII; + +public: + explicit HexagonDAGToDAGISel(HexagonTargetMachine &targetmachine) + : SelectionDAGISel(targetmachine), + Subtarget(targetmachine.getSubtarget<HexagonSubtarget>()), + TM(targetmachine), + TII(static_cast<const HexagonInstrInfo*>(TM.getInstrInfo())) { + + } + + SDNode *Select(SDNode *N); + + // Complex Pattern Selectors. + bool SelectADDRri(SDValue& N, SDValue &R1, SDValue &R2); + bool SelectADDRriS11_0(SDValue& N, SDValue &R1, SDValue &R2); + bool SelectADDRriS11_1(SDValue& N, SDValue &R1, SDValue &R2); + bool SelectADDRriS11_2(SDValue& N, SDValue &R1, SDValue &R2); + bool SelectMEMriS11_2(SDValue& Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRriS11_3(SDValue& N, SDValue &R1, SDValue &R2); + bool SelectADDRrr(SDValue &Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRriU6_0(SDValue& N, SDValue &R1, SDValue &R2); + bool SelectADDRriU6_1(SDValue& N, SDValue &R1, SDValue &R2); + bool SelectADDRriU6_2(SDValue& N, SDValue &R1, SDValue &R2); + + virtual const char *getPassName() const { + return "Hexagon DAG->DAG Pattern Instruction Selection"; + } + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. + virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector<SDValue> &OutOps); + bool SelectAddr(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Offset); + + SDNode *SelectLoad(SDNode *N); + SDNode *SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl); + SDNode *SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl); + SDNode *SelectIndexedLoadZeroExtend64(LoadSDNode *LD, unsigned Opcode, + DebugLoc dl); + SDNode *SelectIndexedLoadSignExtend64(LoadSDNode *LD, unsigned Opcode, + DebugLoc dl); + SDNode *SelectBaseOffsetStore(StoreSDNode *ST, DebugLoc dl); + SDNode *SelectIndexedStore(StoreSDNode *ST, DebugLoc dl); + SDNode *SelectStore(SDNode *N); + SDNode *SelectSHL(SDNode *N); + SDNode *SelectSelect(SDNode *N); + SDNode *SelectTruncate(SDNode *N); + SDNode *SelectMul(SDNode *N); + SDNode *SelectZeroExtend(SDNode *N); + SDNode *SelectIntrinsicWOChain(SDNode *N); + SDNode *SelectConstant(SDNode *N); + SDNode *SelectAdd(SDNode *N); + + // Include the pieces autogenerated from the target description. +#include "HexagonGenDAGISel.inc" +}; +} // end anonymous namespace + + +/// createHexagonISelDag - This pass converts a legalized DAG into a +/// Hexagon-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createHexagonISelDag(HexagonTargetMachine &TM) { + return new HexagonDAGToDAGISel(TM); +} + +static bool IsS11_0_Offset(SDNode * S) { + ConstantSDNode *N = cast<ConstantSDNode>(S); + + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<11>(v); +} + + +static bool IsS11_1_Offset(SDNode * S) { + ConstantSDNode *N = cast<ConstantSDNode>(S); + + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<11,1>(v); +} + + +static bool IsS11_2_Offset(SDNode * S) { + ConstantSDNode *N = cast<ConstantSDNode>(S); + + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. 
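+  // The check below actually accepts a multiple of 4 whose scaled value fits
+  // in a signed 11-bit field.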
+ int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<11,2>(v); +} + + +static bool IsS11_3_Offset(SDNode * S) { + ConstantSDNode *N = cast<ConstantSDNode>(S); + + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<11,3>(v); +} + + +static bool IsU6_0_Offset(SDNode * S) { + ConstantSDNode *N = cast<ConstantSDNode>(S); + + // u6 predicate - True if the immediate fits in a 6-bit unsigned extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<6>(v); +} + + +static bool IsU6_1_Offset(SDNode * S) { + ConstantSDNode *N = cast<ConstantSDNode>(S); + + // u6 predicate - True if the immediate fits in a 6-bit unsigned extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedUInt<6,1>(v); +} + + +static bool IsU6_2_Offset(SDNode * S) { + ConstantSDNode *N = cast<ConstantSDNode>(S); + + // u6 predicate - True if the immediate fits in a 6-bit unsigned extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedUInt<6,2>(v); +} + + +// Intrinsics that return a a predicate. +static unsigned doesIntrinsicReturnPredicate(unsigned ID) +{ + switch (ID) { + default: + return 0; + case Intrinsic::hexagon_C2_cmpeq: + case Intrinsic::hexagon_C2_cmpgt: + case Intrinsic::hexagon_C2_cmpgtu: + case Intrinsic::hexagon_C2_cmpgtup: + case Intrinsic::hexagon_C2_cmpgtp: + case Intrinsic::hexagon_C2_cmpeqp: + case Intrinsic::hexagon_C2_bitsset: + case Intrinsic::hexagon_C2_bitsclr: + case Intrinsic::hexagon_C2_cmpeqi: + case Intrinsic::hexagon_C2_cmpgti: + case Intrinsic::hexagon_C2_cmpgtui: + case Intrinsic::hexagon_C2_cmpgei: + case Intrinsic::hexagon_C2_cmpgeui: + case Intrinsic::hexagon_C2_cmplt: + case Intrinsic::hexagon_C2_cmpltu: + case Intrinsic::hexagon_C2_bitsclri: + case Intrinsic::hexagon_C2_and: + case Intrinsic::hexagon_C2_or: + case Intrinsic::hexagon_C2_xor: + case Intrinsic::hexagon_C2_andn: + case Intrinsic::hexagon_C2_not: + case Intrinsic::hexagon_C2_orn: + case Intrinsic::hexagon_C2_pxfer_map: + case Intrinsic::hexagon_C2_any8: + case Intrinsic::hexagon_C2_all8: + case Intrinsic::hexagon_A2_vcmpbeq: + case Intrinsic::hexagon_A2_vcmpbgtu: + case Intrinsic::hexagon_A2_vcmpheq: + case Intrinsic::hexagon_A2_vcmphgt: + case Intrinsic::hexagon_A2_vcmphgtu: + case Intrinsic::hexagon_A2_vcmpweq: + case Intrinsic::hexagon_A2_vcmpwgt: + case Intrinsic::hexagon_A2_vcmpwgtu: + case Intrinsic::hexagon_C2_tfrrp: + case Intrinsic::hexagon_S2_tstbit_i: + case Intrinsic::hexagon_S2_tstbit_r: + return 1; + } +} + + +// Intrinsics that have predicate operands. +static unsigned doesIntrinsicContainPredicate(unsigned ID) +{ + switch (ID) { + default: + return 0; + case Intrinsic::hexagon_C2_tfrpr: + return Hexagon::TFR_RsPd; + case Intrinsic::hexagon_C2_and: + return Hexagon::AND_pp; + case Intrinsic::hexagon_C2_xor: + return Hexagon::XOR_pp; + case Intrinsic::hexagon_C2_or: + return Hexagon::OR_pp; + case Intrinsic::hexagon_C2_not: + return Hexagon::NOT_pp; + case Intrinsic::hexagon_C2_any8: + return Hexagon::ANY_pp; + case Intrinsic::hexagon_C2_all8: + return Hexagon::ALL_pp; + case Intrinsic::hexagon_C2_vitpack: + return Hexagon::VITPACK_pp; + case Intrinsic::hexagon_C2_mask: + return Hexagon::MASK_p; + case Intrinsic::hexagon_C2_mux: + return Hexagon::MUX_rr; + + // Mapping hexagon_C2_muxir to MUX_pri. This is pretty weird - but + // that's how it's mapped in q6protos.h. 
+ case Intrinsic::hexagon_C2_muxir: + return Hexagon::MUX_ri; + + // Mapping hexagon_C2_muxri to MUX_pir. This is pretty weird - but + // that's how it's mapped in q6protos.h. + case Intrinsic::hexagon_C2_muxri: + return Hexagon::MUX_ir; + + case Intrinsic::hexagon_C2_muxii: + return Hexagon::MUX_ii; + case Intrinsic::hexagon_C2_vmux: + return Hexagon::VMUX_prr64; + case Intrinsic::hexagon_S2_valignrb: + return Hexagon::VALIGN_rrp; + case Intrinsic::hexagon_S2_vsplicerb: + return Hexagon::VSPLICE_rrp; + } +} + + +static bool OffsetFitsS11(EVT MemType, int64_t Offset) { + if (MemType == MVT::i64 && isShiftedInt<11,3>(Offset)) { + return true; + } + if (MemType == MVT::i32 && isShiftedInt<11,2>(Offset)) { + return true; + } + if (MemType == MVT::i16 && isShiftedInt<11,1>(Offset)) { + return true; + } + if (MemType == MVT::i8 && isInt<11>(Offset)) { + return true; + } + return false; +} + + +// +// Try to lower loads of GlobalAdresses into base+offset loads. Custom +// lowering for GlobalAddress nodes has already turned it into a +// CONST32. +// +SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl) { + EVT LoadedVT = LD->getMemoryVT(); + SDValue Chain = LD->getChain(); + SDNode* Const32 = LD->getBasePtr().getNode(); + unsigned Opcode = 0; + + if (Const32->getOpcode() == HexagonISD::CONST32 && + ISD::isNormalLoad(LD)) { + SDValue Base = Const32->getOperand(0); + EVT LoadedVT = LD->getMemoryVT(); + int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset(); + if (Offset != 0 && OffsetFitsS11(LoadedVT, Offset)) { + MVT PointerTy = TLI.getPointerTy(); + const GlobalValue* GV = + cast<GlobalAddressSDNode>(Base)->getGlobal(); + SDValue TargAddr = + CurDAG->getTargetGlobalAddress(GV, dl, PointerTy, 0); + SDNode* NewBase = CurDAG->getMachineNode(Hexagon::CONST32_set, + dl, PointerTy, + TargAddr); + // Figure out base + offset opcode + if (LoadedVT == MVT::i64) Opcode = Hexagon::LDrid_indexed; + else if (LoadedVT == MVT::i32) Opcode = Hexagon::LDriw_indexed; + else if (LoadedVT == MVT::i16) Opcode = Hexagon::LDrih_indexed; + else if (LoadedVT == MVT::i8) Opcode = Hexagon::LDrib_indexed; + else assert (0 && "unknown memory type"); + + // Build indexed load. 
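+      // The CONST32_set of the global becomes the base register and the
+      // global's offset is folded into the immediate operand.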
+ SDValue TargetConstOff = CurDAG->getTargetConstant(Offset, PointerTy); + SDNode* Result = CurDAG->getMachineNode(Opcode, dl, + LD->getValueType(0), + MVT::Other, + SDValue(NewBase,0), + TargetConstOff, + Chain); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = LD->getMemOperand(); + cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); + ReplaceUses(LD, Result); + return Result; + } + } + + return SelectCode(LD); +} + + +SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD, + unsigned Opcode, + DebugLoc dl) +{ + SDValue Chain = LD->getChain(); + EVT LoadedVT = LD->getMemoryVT(); + SDValue Base = LD->getBasePtr(); + SDValue Offset = LD->getOffset(); + SDNode *OffsetNode = Offset.getNode(); + int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); + SDValue N1 = LD->getOperand(1); + SDValue CPTmpN1_0; + SDValue CPTmpN1_1; + if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) && + N1.getNode()->getValueType(0) == MVT::i32) { + if (TII->isValidAutoIncImm(LoadedVT, Val)) { + SDValue TargetConst = CurDAG->getTargetConstant(Val, MVT::i32); + SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32, + MVT::Other, Base, TargetConst, + Chain); + SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::SXTW, dl, MVT::i64, + SDValue(Result_1, 0)); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = LD->getMemOperand(); + cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 1), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result_2, 0), + SDValue(Result_1, 1), + SDValue(Result_1, 2) + }; + ReplaceUses(Froms, Tos, 3); + return Result_2; + } + SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); + SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, + MVT::Other, Base, TargetConst0, + Chain); + SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::SXTW, dl, + MVT::i64, SDValue(Result_1, 0)); + SDNode* Result_3 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, + MVT::i32, Base, TargetConstVal, + SDValue(Result_1, 1)); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = LD->getMemOperand(); + cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 1), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result_2, 0), + SDValue(Result_3, 0), + SDValue(Result_1, 1) + }; + ReplaceUses(Froms, Tos, 3); + return Result_2; + } + return SelectCode(LD); +} + + +SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD, + unsigned Opcode, + DebugLoc dl) +{ + SDValue Chain = LD->getChain(); + EVT LoadedVT = LD->getMemoryVT(); + SDValue Base = LD->getBasePtr(); + SDValue Offset = LD->getOffset(); + SDNode *OffsetNode = Offset.getNode(); + int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); + SDValue N1 = LD->getOperand(1); + SDValue CPTmpN1_0; + SDValue CPTmpN1_1; + if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) && + N1.getNode()->getValueType(0) == MVT::i32) { + if (TII->isValidAutoIncImm(LoadedVT, Val)) { + SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); + SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, + MVT::i32, MVT::Other, Base, + TargetConstVal, Chain); + SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32, + 
TargetConst0); + SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl, + MVT::i64, MVT::Other, + SDValue(Result_2,0), + SDValue(Result_1,0)); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = LD->getMemOperand(); + cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 1), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result_3, 0), + SDValue(Result_1, 1), + SDValue(Result_1, 2) + }; + ReplaceUses(Froms, Tos, 3); + return Result_3; + } + + // Generate an indirect load. + SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); + SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, + MVT::Other, + Base, TargetConst0, Chain); + SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32, + TargetConst0); + SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl, + MVT::i64, MVT::Other, + SDValue(Result_2,0), + SDValue(Result_1,0)); + // Add offset to base. + SDNode* Result_4 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32, + Base, TargetConstVal, + SDValue(Result_1, 1)); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = LD->getMemOperand(); + cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 1), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result_3, 0), // Load value. + SDValue(Result_4, 0), // New address. + SDValue(Result_1, 1) + }; + ReplaceUses(Froms, Tos, 3); + return Result_3; + } + + return SelectCode(LD); +} + + +SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl) { + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Offset = LD->getOffset(); + SDNode *OffsetNode = Offset.getNode(); + // Get the constant value. + int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); + EVT LoadedVT = LD->getMemoryVT(); + unsigned Opcode = 0; + + // Check for zero ext loads. + bool zextval = (LD->getExtensionType() == ISD::ZEXTLOAD); + + // Figure out the opcode. + if (LoadedVT == MVT::i64) { + if (TII->isValidAutoIncImm(LoadedVT, Val)) + Opcode = Hexagon::POST_LDrid; + else + Opcode = Hexagon::LDrid; + } else if (LoadedVT == MVT::i32) { + if (TII->isValidAutoIncImm(LoadedVT, Val)) + Opcode = Hexagon::POST_LDriw; + else + Opcode = Hexagon::LDriw; + } else if (LoadedVT == MVT::i16) { + if (TII->isValidAutoIncImm(LoadedVT, Val)) + Opcode = zextval ? Hexagon::POST_LDriuh : Hexagon::POST_LDrih; + else + Opcode = zextval ? Hexagon::LDriuh : Hexagon::LDrih; + } else if (LoadedVT == MVT::i8) { + if (TII->isValidAutoIncImm(LoadedVT, Val)) + Opcode = zextval ? Hexagon::POST_LDriub : Hexagon::POST_LDrib; + else + Opcode = zextval ? Hexagon::LDriub : Hexagon::LDrib; + } else + assert (0 && "unknown memory type"); + + // For zero ext i64 loads, we need to add combine instructions. + if (LD->getValueType(0) == MVT::i64 && + LD->getExtensionType() == ISD::ZEXTLOAD) { + return SelectIndexedLoadZeroExtend64(LD, Opcode, dl); + } + if (LD->getValueType(0) == MVT::i64 && + LD->getExtensionType() == ISD::SEXTLOAD) { + // Handle sign ext i64 loads. 
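+    // The 32-bit load is widened to 64 bits with an explicit SXTW.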
+ return SelectIndexedLoadSignExtend64(LD, Opcode, dl); + } + if (TII->isValidAutoIncImm(LoadedVT, Val)) { + SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); + SDNode* Result = CurDAG->getMachineNode(Opcode, dl, + LD->getValueType(0), + MVT::i32, MVT::Other, Base, + TargetConstVal, Chain); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = LD->getMemOperand(); + cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 1), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result, 0), + SDValue(Result, 1), + SDValue(Result, 2) + }; + ReplaceUses(Froms, Tos, 3); + return Result; + } else { + SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); + SDNode* Result_1 = CurDAG->getMachineNode(Opcode, dl, + LD->getValueType(0), + MVT::Other, Base, TargetConst0, + Chain); + SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32, + Base, TargetConstVal, + SDValue(Result_1, 1)); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = LD->getMemOperand(); + cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 1), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result_1, 0), + SDValue(Result_2, 0), + SDValue(Result_1, 1) + }; + ReplaceUses(Froms, Tos, 3); + return Result_1; + } + + return SelectCode(LD); +} + + +SDNode *HexagonDAGToDAGISel::SelectLoad(SDNode *N) { + SDNode *result; + DebugLoc dl = N->getDebugLoc(); + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + + // Handle indexed loads. + if (AM != ISD::UNINDEXED) { + result = SelectIndexedLoad(LD, dl); + } else { + result = SelectBaseOffsetLoad(LD, dl); + } + + return result; +} + + +SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) { + SDValue Chain = ST->getChain(); + SDValue Base = ST->getBasePtr(); + SDValue Offset = ST->getOffset(); + SDValue Value = ST->getValue(); + SDNode *OffsetNode = Offset.getNode(); + // Get the constant value. + int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); + EVT StoredVT = ST->getMemoryVT(); + + // Offset value must be within representable range + // and must have correct alignment properties. + if (TII->isValidAutoIncImm(StoredVT, Val)) { + SDValue Ops[] = { Value, Base, + CurDAG->getTargetConstant(Val, MVT::i32), Chain}; + unsigned Opcode = 0; + + // Figure out the post inc version of opcode. + if (StoredVT == MVT::i64) Opcode = Hexagon::POST_STdri; + else if (StoredVT == MVT::i32) Opcode = Hexagon::POST_STwri; + else if (StoredVT == MVT::i16) Opcode = Hexagon::POST_SThri; + else if (StoredVT == MVT::i8) Opcode = Hexagon::POST_STbri; + else assert (0 && "unknown memory type"); + + // Build post increment store. + SDNode* Result = CurDAG->getMachineNode(Opcode, dl, MVT::i32, + MVT::Other, Ops, 4); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = ST->getMemOperand(); + cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); + + ReplaceUses(ST, Result); + ReplaceUses(SDValue(ST,1), SDValue(Result,1)); + return Result; + } + + // Note: Order of operands matches the def of instruction: + // def STrid : STInst<(outs), (ins MEMri:$addr, DoubleRegs:$src1), ... + // and it differs for POST_ST* for instance. 
+ SDValue Ops[] = { Base, CurDAG->getTargetConstant(0, MVT::i32), Value, + Chain}; + unsigned Opcode = 0; + + // Figure out the opcode. + if (StoredVT == MVT::i64) Opcode = Hexagon::STrid; + else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw; + else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih; + else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib; + else assert (0 && "unknown memory type"); + + // Build regular store. + SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); + SDNode* Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops, + 4); + // Build splitted incriment instruction. + SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32, + Base, + TargetConstVal, + SDValue(Result_1, 0)); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = ST->getMemOperand(); + cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); + + ReplaceUses(SDValue(ST,0), SDValue(Result_2,0)); + ReplaceUses(SDValue(ST,1), SDValue(Result_1,0)); + return Result_2; +} + + +SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST, + DebugLoc dl) { + SDValue Chain = ST->getChain(); + SDNode* Const32 = ST->getBasePtr().getNode(); + SDValue Value = ST->getValue(); + unsigned Opcode = 0; + + // Try to lower stores of GlobalAdresses into indexed stores. Custom + // lowering for GlobalAddress nodes has already turned it into a + // CONST32. Avoid truncating stores for the moment. Post-inc stores + // do the same. Don't think there's a reason for it, so will file a + // bug to fix. + if ((Const32->getOpcode() == HexagonISD::CONST32) && + !(Value.getValueType() == MVT::i64 && ST->isTruncatingStore())) { + SDValue Base = Const32->getOperand(0); + if (Base.getOpcode() == ISD::TargetGlobalAddress) { + EVT StoredVT = ST->getMemoryVT(); + int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset(); + if (Offset != 0 && OffsetFitsS11(StoredVT, Offset)) { + MVT PointerTy = TLI.getPointerTy(); + const GlobalValue* GV = + cast<GlobalAddressSDNode>(Base)->getGlobal(); + SDValue TargAddr = + CurDAG->getTargetGlobalAddress(GV, dl, PointerTy, 0); + SDNode* NewBase = CurDAG->getMachineNode(Hexagon::CONST32_set, + dl, PointerTy, + TargAddr); + + // Figure out base + offset opcode + if (StoredVT == MVT::i64) Opcode = Hexagon::STrid_indexed; + else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed; + else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih_indexed; + else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib_indexed; + else assert (0 && "unknown memory type"); + + SDValue Ops[] = {SDValue(NewBase,0), + CurDAG->getTargetConstant(Offset,PointerTy), + Value, Chain}; + // build indexed store + SDNode* Result = CurDAG->getMachineNode(Opcode, dl, + MVT::Other, Ops, 4); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = ST->getMemOperand(); + cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); + ReplaceUses(ST, Result); + return Result; + } + } + } + + return SelectCode(ST); +} + + +SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + StoreSDNode *ST = cast<StoreSDNode>(N); + ISD::MemIndexedMode AM = ST->getAddressingMode(); + + // Handle indexed stores. 
+ if (AM != ISD::UNINDEXED) { + return SelectIndexedStore(ST, dl); + } + + return SelectBaseOffsetStore(ST, dl); +} + +SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + + // + // %conv.i = sext i32 %tmp1 to i64 + // %conv2.i = sext i32 %add to i64 + // %mul.i = mul nsw i64 %conv2.i, %conv.i + // + // --- match with the following --- + // + // %mul.i = mpy (%tmp1, %add) + // + + if (N->getValueType(0) == MVT::i64) { + // Shifting a i64 signed multiply. + SDValue MulOp0 = N->getOperand(0); + SDValue MulOp1 = N->getOperand(1); + + SDValue OP0; + SDValue OP1; + + // Handle sign_extend and sextload. + if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) { + SDValue Sext0 = MulOp0.getOperand(0); + if (Sext0.getNode()->getValueType(0) != MVT::i32) { + SelectCode(N); + } + + OP0 = Sext0; + } else if (MulOp0.getOpcode() == ISD::LOAD) { + LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode()); + if (LD->getMemoryVT() != MVT::i32 || + LD->getExtensionType() != ISD::SEXTLOAD || + LD->getAddressingMode() != ISD::UNINDEXED) { + SelectCode(N); + } + + SDValue Base = LD->getBasePtr(); + SDValue Chain = LD->getChain(); + SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + OP0 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32, + MVT::Other, + LD->getBasePtr(), TargetConst0, + Chain), 0); + } else { + return SelectCode(N); + } + + // Same goes for the second operand. + if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) { + SDValue Sext1 = MulOp1.getOperand(0); + if (Sext1.getNode()->getValueType(0) != MVT::i32) { + return SelectCode(N); + } + + OP1 = Sext1; + } else if (MulOp1.getOpcode() == ISD::LOAD) { + LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode()); + if (LD->getMemoryVT() != MVT::i32 || + LD->getExtensionType() != ISD::SEXTLOAD || + LD->getAddressingMode() != ISD::UNINDEXED) { + return SelectCode(N); + } + + SDValue Base = LD->getBasePtr(); + SDValue Chain = LD->getChain(); + SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + OP1 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32, + MVT::Other, + LD->getBasePtr(), TargetConst0, + Chain), 0); + } else { + return SelectCode(N); + } + + // Generate a mpy instruction. + SDNode *Result = CurDAG->getMachineNode(Hexagon::MPY64, dl, MVT::i64, + OP0, OP1); + ReplaceUses(N, Result); + return Result; + } + + return SelectCode(N); +} + + +SDNode *HexagonDAGToDAGISel::SelectSelect(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() == ISD::SETCC) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::SIGN_EXTEND_INREG) { + SDValue N000 = N00.getOperand(0); + SDValue N001 = N00.getOperand(1); + if (cast<VTSDNode>(N001)->getVT() == MVT::i16) { + SDValue N01 = N0.getOperand(1); + SDValue N02 = N0.getOperand(2); + + // Pattern: (select:i32 (setcc:i1 (sext_inreg:i32 IntRegs:i32:$src2, + // i16:Other),IntRegs:i32:$src1, SETLT:Other),IntRegs:i32:$src1, + // IntRegs:i32:$src2) + // Emits: (MAXh_rr:i32 IntRegs:i32:$src1, IntRegs:i32:$src2) + // Pattern complexity = 9 cost = 1 size = 0. 
+ if (cast<CondCodeSDNode>(N02)->get() == ISD::SETLT) { + SDValue N1 = N->getOperand(1); + if (N01 == N1) { + SDValue N2 = N->getOperand(2); + if (N000 == N2 && + N0.getNode()->getValueType(N0.getResNo()) == MVT::i1 && + N00.getNode()->getValueType(N00.getResNo()) == MVT::i32) { + SDNode *SextNode = CurDAG->getMachineNode(Hexagon::SXTH, dl, + MVT::i32, N000); + SDNode *Result = CurDAG->getMachineNode(Hexagon::MAXw_rr, dl, + MVT::i32, + SDValue(SextNode, 0), + N1); + ReplaceUses(N, Result); + return Result; + } + } + } + + // Pattern: (select:i32 (setcc:i1 (sext_inreg:i32 IntRegs:i32:$src2, + // i16:Other), IntRegs:i32:$src1, SETGT:Other), IntRegs:i32:$src1, + // IntRegs:i32:$src2) + // Emits: (MINh_rr:i32 IntRegs:i32:$src1, IntRegs:i32:$src2) + // Pattern complexity = 9 cost = 1 size = 0. + if (cast<CondCodeSDNode>(N02)->get() == ISD::SETGT) { + SDValue N1 = N->getOperand(1); + if (N01 == N1) { + SDValue N2 = N->getOperand(2); + if (N000 == N2 && + N0.getNode()->getValueType(N0.getResNo()) == MVT::i1 && + N00.getNode()->getValueType(N00.getResNo()) == MVT::i32) { + SDNode *SextNode = CurDAG->getMachineNode(Hexagon::SXTH, dl, + MVT::i32, N000); + SDNode *Result = CurDAG->getMachineNode(Hexagon::MINw_rr, dl, + MVT::i32, + SDValue(SextNode, 0), + N1); + ReplaceUses(N, Result); + return Result; + } + } + } + } + } + } + + return SelectCode(N); +} + + +SDNode *HexagonDAGToDAGISel::SelectTruncate(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + SDValue Shift = N->getOperand(0); + + // + // %conv.i = sext i32 %tmp1 to i64 + // %conv2.i = sext i32 %add to i64 + // %mul.i = mul nsw i64 %conv2.i, %conv.i + // %shr5.i = lshr i64 %mul.i, 32 + // %conv3.i = trunc i64 %shr5.i to i32 + // + // --- match with the following --- + // + // %conv3.i = mpy (%tmp1, %add) + // + // Trunc to i32. + if (N->getValueType(0) == MVT::i32) { + // Trunc from i64. + if (Shift.getNode()->getValueType(0) == MVT::i64) { + // Trunc child is logical shift right. + if (Shift.getOpcode() != ISD::SRL) { + return SelectCode(N); + } + + SDValue ShiftOp0 = Shift.getOperand(0); + SDValue ShiftOp1 = Shift.getOperand(1); + + // Shift by const 32 + if (ShiftOp1.getOpcode() != ISD::Constant) { + return SelectCode(N); + } + + int32_t ShiftConst = + cast<ConstantSDNode>(ShiftOp1.getNode())->getSExtValue(); + if (ShiftConst != 32) { + return SelectCode(N); + } + + // Shifting a i64 signed multiply + SDValue Mul = ShiftOp0; + if (Mul.getOpcode() != ISD::MUL) { + return SelectCode(N); + } + + SDValue MulOp0 = Mul.getOperand(0); + SDValue MulOp1 = Mul.getOperand(1); + + SDValue OP0; + SDValue OP1; + + // Handle sign_extend and sextload + if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) { + SDValue Sext0 = MulOp0.getOperand(0); + if (Sext0.getNode()->getValueType(0) != MVT::i32) { + return SelectCode(N); + } + + OP0 = Sext0; + } else if (MulOp0.getOpcode() == ISD::LOAD) { + LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode()); + if (LD->getMemoryVT() != MVT::i32 || + LD->getExtensionType() != ISD::SEXTLOAD || + LD->getAddressingMode() != ISD::UNINDEXED) { + return SelectCode(N); + } + + SDValue Base = LD->getBasePtr(); + SDValue Chain = LD->getChain(); + SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + OP0 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32, + MVT::Other, + LD->getBasePtr(), + TargetConst0, Chain), 0); + } else { + return SelectCode(N); + } + + // Same goes for the second operand. 
+ if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) { + SDValue Sext1 = MulOp1.getOperand(0); + if (Sext1.getNode()->getValueType(0) != MVT::i32) + return SelectCode(N); + + OP1 = Sext1; + } else if (MulOp1.getOpcode() == ISD::LOAD) { + LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode()); + if (LD->getMemoryVT() != MVT::i32 || + LD->getExtensionType() != ISD::SEXTLOAD || + LD->getAddressingMode() != ISD::UNINDEXED) { + return SelectCode(N); + } + + SDValue Base = LD->getBasePtr(); + SDValue Chain = LD->getChain(); + SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + OP1 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32, + MVT::Other, + LD->getBasePtr(), + TargetConst0, Chain), 0); + } else { + return SelectCode(N); + } + + // Generate a mpy instruction. + SDNode *Result = CurDAG->getMachineNode(Hexagon::MPY, dl, MVT::i32, + OP0, OP1); + ReplaceUses(N, Result); + return Result; + } + } + + return SelectCode(N); +} + + +SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + if (N->getValueType(0) == MVT::i32) { + SDValue Shl_0 = N->getOperand(0); + SDValue Shl_1 = N->getOperand(1); + // RHS is const. + if (Shl_1.getOpcode() == ISD::Constant) { + if (Shl_0.getOpcode() == ISD::MUL) { + SDValue Mul_0 = Shl_0.getOperand(0); // Val + SDValue Mul_1 = Shl_0.getOperand(1); // Const + // RHS of mul is const. + if (Mul_1.getOpcode() == ISD::Constant) { + int32_t ShlConst = + cast<ConstantSDNode>(Shl_1.getNode())->getSExtValue(); + int32_t MulConst = + cast<ConstantSDNode>(Mul_1.getNode())->getSExtValue(); + int32_t ValConst = MulConst << ShlConst; + SDValue Val = CurDAG->getTargetConstant(ValConst, + MVT::i32); + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val.getNode())) + if (isInt<9>(CN->getSExtValue())) { + SDNode* Result = + CurDAG->getMachineNode(Hexagon::MPYI_ri, dl, + MVT::i32, Mul_0, Val); + ReplaceUses(N, Result); + return Result; + } + + } + } else if (Shl_0.getOpcode() == ISD::SUB) { + SDValue Sub_0 = Shl_0.getOperand(0); // Const 0 + SDValue Sub_1 = Shl_0.getOperand(1); // Val + if (Sub_0.getOpcode() == ISD::Constant) { + int32_t SubConst = + cast<ConstantSDNode>(Sub_0.getNode())->getSExtValue(); + if (SubConst == 0) { + if (Sub_1.getOpcode() == ISD::SHL) { + SDValue Shl2_0 = Sub_1.getOperand(0); // Val + SDValue Shl2_1 = Sub_1.getOperand(1); // Const + if (Shl2_1.getOpcode() == ISD::Constant) { + int32_t ShlConst = + cast<ConstantSDNode>(Shl_1.getNode())->getSExtValue(); + int32_t Shl2Const = + cast<ConstantSDNode>(Shl2_1.getNode())->getSExtValue(); + int32_t ValConst = 1 << (ShlConst+Shl2Const); + SDValue Val = CurDAG->getTargetConstant(-ValConst, MVT::i32); + if (ConstantSDNode *CN = + dyn_cast<ConstantSDNode>(Val.getNode())) + if (isInt<9>(CN->getSExtValue())) { + SDNode* Result = + CurDAG->getMachineNode(Hexagon::MPYI_ri, dl, MVT::i32, + Shl2_0, Val); + ReplaceUses(N, Result); + return Result; + } + } + } + } + } + } + } + } + return SelectCode(N); +} + + +// +// If there is an zero_extend followed an intrinsic in DAG (this means - the +// result of the intrinsic is predicate); convert the zero_extend to +// transfer instruction. +// +// Zero extend -> transfer is lowered here. Otherwise, zero_extend will be +// converted into a MUX as predicate registers defined as 1 bit in the +// compiler. Architecture defines them as 8-bit registers. +// We want to preserve all the lower 8-bits and, not just 1 LSB bit. 
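+// For an i32 result a single TFR_RsPd suffices; for an i64 result the value
+// is additionally widened with COMBINE_rr(0, Rs).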
+// +SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + SDNode *IsIntrinsic = N->getOperand(0).getNode(); + if ((IsIntrinsic->getOpcode() == ISD::INTRINSIC_WO_CHAIN)) { + unsigned ID = + cast<ConstantSDNode>(IsIntrinsic->getOperand(0))->getZExtValue(); + if (doesIntrinsicReturnPredicate(ID)) { + // Now we need to differentiate target data types. + if (N->getValueType(0) == MVT::i64) { + // Convert the zero_extend to Rs = Pd followed by COMBINE_rr(0,Rs). + SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + SDNode *Result_1 = CurDAG->getMachineNode(Hexagon::TFR_RsPd, dl, + MVT::i32, + SDValue(IsIntrinsic, 0)); + SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl, + MVT::i32, + TargetConst0); + SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl, + MVT::i64, MVT::Other, + SDValue(Result_2, 0), + SDValue(Result_1, 0)); + ReplaceUses(N, Result_3); + return Result_3; + } + if (N->getValueType(0) == MVT::i32) { + // Convert the zero_extend to Rs = Pd + SDNode* RsPd = CurDAG->getMachineNode(Hexagon::TFR_RsPd, dl, + MVT::i32, + SDValue(IsIntrinsic, 0)); + ReplaceUses(N, RsPd); + return RsPd; + } + assert(0 && "Unexpected value type"); + } + } + return SelectCode(N); +} + + +// +// Checking for intrinsics which have predicate registers as operand(s) +// and lowering to the actual intrinsic. +// +SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + unsigned ID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + unsigned IntrinsicWithPred = doesIntrinsicContainPredicate(ID); + + // We are concerned with only those intrinsics that have predicate registers + // as at least one of the operands. + if (IntrinsicWithPred) { + SmallVector<SDValue, 8> Ops; + const MCInstrDesc &MCID = TII->get(IntrinsicWithPred); + const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + + // Iterate over all the operands of the intrinsics. + // For PredRegs, do the transfer. + // For Double/Int Regs, just preserve the value + // For immediates, lower it. + for (unsigned i = 1; i < N->getNumOperands(); ++i) { + SDNode *Arg = N->getOperand(i).getNode(); + const TargetRegisterClass *RC = TII->getRegClass(MCID, i, TRI); + + if (RC == Hexagon::IntRegsRegisterClass || + RC == Hexagon::DoubleRegsRegisterClass) { + Ops.push_back(SDValue(Arg, 0)); + } else if (RC == Hexagon::PredRegsRegisterClass) { + // Do the transfer. + SDNode *PdRs = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1, + SDValue(Arg, 0)); + Ops.push_back(SDValue(PdRs,0)); + } else if (RC == NULL && (dyn_cast<ConstantSDNode>(Arg) != NULL)) { + // This is immediate operand. Lower it here making sure that we DO have + // const SDNode for immediate value. + int32_t Val = cast<ConstantSDNode>(Arg)->getSExtValue(); + SDValue SDVal = CurDAG->getTargetConstant(Val, MVT::i32); + Ops.push_back(SDVal); + } else { + assert(0 && "Unimplemented"); + } + } + EVT ReturnValueVT = N->getValueType(0); + SDNode *Result = CurDAG->getMachineNode(IntrinsicWithPred, dl, + ReturnValueVT, + Ops.data(), Ops.size()); + ReplaceUses(N, Result); + return Result; + } + return SelectCode(N); +} + + +// +// Map predicate true (encoded as -1 in LLVM) to a XOR. 
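+// The i1 constant is materialized as Pd = TFR_PdRs(TFRI #0) followed by
+// xor(Pd, not(Pd)), which evaluates to all ones.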
+// +SDNode *HexagonDAGToDAGISel::SelectConstant(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + if (N->getValueType(0) == MVT::i1) { + SDNode* Result; + int32_t Val = cast<ConstantSDNode>(N)->getSExtValue(); + if (Val == -1) { + unsigned NewIntReg = TM.getInstrInfo()->createVR(MF, MVT(MVT::i32)); + SDValue Reg = CurDAG->getRegister(NewIntReg, MVT::i32); + + // Create the IntReg = 1 node. + SDNode* IntRegTFR = + CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32, + CurDAG->getTargetConstant(0, MVT::i32)); + + // Pd = IntReg + SDNode* Pd = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1, + SDValue(IntRegTFR, 0)); + + // not(Pd) + SDNode* NotPd = CurDAG->getMachineNode(Hexagon::NOT_pp, dl, MVT::i1, + SDValue(Pd, 0)); + + // xor(not(Pd)) + Result = CurDAG->getMachineNode(Hexagon::XOR_pp, dl, MVT::i1, + SDValue(Pd, 0), SDValue(NotPd, 0)); + + // We have just built: + // Rs = Pd + // Pd = xor(not(Pd), Pd) + + ReplaceUses(N, Result); + return Result; + } + } + + return SelectCode(N); +} + + +// +// Map add followed by a asr -> asr +=. +// +SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + if (N->getValueType(0) != MVT::i32) { + return SelectCode(N); + } + // Identify nodes of the form: add(asr(...)). + SDNode* Src1 = N->getOperand(0).getNode(); + if (Src1->getOpcode() != ISD::SRA || !Src1->hasOneUse() + || Src1->getValueType(0) != MVT::i32) { + return SelectCode(N); + } + + // Build Rd = Rd' + asr(Rs, Rt). The machine constraints will ensure that + // Rd and Rd' are assigned to the same register + SDNode* Result = CurDAG->getMachineNode(Hexagon::ASR_rr_acc, dl, MVT::i32, + N->getOperand(1), + Src1->getOperand(0), + Src1->getOperand(1)); + ReplaceUses(N, Result); + + return Result; +} + + +SDNode *HexagonDAGToDAGISel::Select(SDNode *N) { + if (N->isMachineOpcode()) + return NULL; // Already selected. + + + switch (N->getOpcode()) { + case ISD::Constant: + return SelectConstant(N); + + case ISD::ADD: + return SelectAdd(N); + + case ISD::SHL: + return SelectSHL(N); + + case ISD::LOAD: + return SelectLoad(N); + + case ISD::STORE: + return SelectStore(N); + + case ISD::SELECT: + return SelectSelect(N); + + case ISD::TRUNCATE: + return SelectTruncate(N); + + case ISD::MUL: + return SelectMul(N); + + case ISD::ZERO_EXTEND: + return SelectZeroExtend(N); + + case ISD::INTRINSIC_WO_CHAIN: + return SelectIntrinsicWOChain(N); + } + + return SelectCode(N); +} + + +// +// Hexagon_TODO: Five functions for ADDRri?! Surely there must be a better way +// to define these instructions. +// +bool HexagonDAGToDAGISel::SelectADDRri(SDValue& Addr, SDValue &Base, + SDValue &Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // Direct calls. + + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + + +bool HexagonDAGToDAGISel::SelectADDRriS11_0(SDValue& Addr, SDValue &Base, + SDValue &Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // Direct calls. 
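The SelectAdd combine above keys on an add whose first operand is a single-use arithmetic shift right, so the shift can be folded into Hexagon's accumulating form. A minimal standalone sketch of a source-level shape that produces that DAG, with the expected mnemonic only as an illustrative comment:

// Standalone sketch, not part of the patch: a plain C++ function whose DAG
// contains add(x, sra(y, z)), the shape SelectAdd above rewrites to
// ASR_rr_acc. The mnemonic in the comment is illustrative.
#include <cstdio>

int accumulateShifted(int Acc, int Val, int Amt) {
  return Acc + (Val >> Amt);   // illustrative lowering: Rx += asr(Rs, Rt)
}

int main() {
  std::printf("%d\n", accumulateShifted(100, 64, 3));  // prints 108
  return 0;
}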
+ + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsS11_0_Offset(Offset.getNode())); + } + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsS11_0_Offset(Offset.getNode())); +} + + +bool HexagonDAGToDAGISel::SelectADDRriS11_1(SDValue& Addr, SDValue &Base, + SDValue &Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // Direct calls. + + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsS11_1_Offset(Offset.getNode())); + } + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsS11_1_Offset(Offset.getNode())); +} + + +bool HexagonDAGToDAGISel::SelectADDRriS11_2(SDValue& Addr, SDValue &Base, + SDValue &Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // Direct calls. + + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsS11_2_Offset(Offset.getNode())); + } + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsS11_2_Offset(Offset.getNode())); +} + + +bool HexagonDAGToDAGISel::SelectADDRriU6_0(SDValue& Addr, SDValue &Base, + SDValue &Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // Direct calls. + + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsU6_0_Offset(Offset.getNode())); + } + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsU6_0_Offset(Offset.getNode())); +} + + +bool HexagonDAGToDAGISel::SelectADDRriU6_1(SDValue& Addr, SDValue &Base, + SDValue &Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // Direct calls. + + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsU6_1_Offset(Offset.getNode())); + } + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsU6_1_Offset(Offset.getNode())); +} + + +bool HexagonDAGToDAGISel::SelectADDRriU6_2(SDValue& Addr, SDValue &Base, + SDValue &Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // Direct calls. 
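Each SelectADDRri* variant above differs only in the width and scale of the offset it accepts (s11_0 through s11_3, u6_0 through u6_2). The IsS11_*/IsU6_* helpers are defined earlier in this file; the sketch below assumes sN_K/uN_K means an N-bit signed/unsigned field whose value must be a multiple of 2^K.

// Standalone sketch, not part of the patch: a scaled immediate range check
// of the kind the IsS11_*/IsU6_* predicates perform. The template
// parameters and exact semantics here are assumptions for illustration.
#include <cassert>
#include <cstdint>

template <unsigned Bits, unsigned Scale, bool Signed>
bool fitsScaledImm(int64_t V) {
  const int64_t Unit = 1LL << Scale;
  if (V % Unit != 0)                    // offset must be a multiple of 2^Scale
    return false;
  V /= Unit;
  if (Signed)
    return V >= -(1LL << (Bits - 1)) && V < (1LL << (Bits - 1));
  return V >= 0 && V < (1LL << Bits);
}

int main() {
  assert((fitsScaledImm<11, 2, true>(4092)));   // 1023 * 4: fits s11_2
  assert((!fitsScaledImm<11, 2, true>(4094)));  // not a multiple of 4
  assert((fitsScaledImm<6, 1, false>(126)));    // 63 * 2: fits u6_1
  assert((!fitsScaledImm<6, 1, false>(-2)));    // negative in an unsigned field
  return 0;
}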
+ + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsU6_2_Offset(Offset.getNode())); + } + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsU6_2_Offset(Offset.getNode())); +} + + +bool HexagonDAGToDAGISel::SelectMEMriS11_2(SDValue& Addr, SDValue &Base, + SDValue &Offset) { + + if (Addr.getOpcode() != ISD::ADD) { + return(SelectADDRriS11_2(Addr, Base, Offset)); + } + + return SelectADDRriS11_2(Addr, Base, Offset); +} + + +bool HexagonDAGToDAGISel::SelectADDRriS11_3(SDValue& Addr, SDValue &Base, + SDValue &Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // Direct calls. + + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsS11_3_Offset(Offset.getNode())); + } + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return (IsS11_3_Offset(Offset.getNode())); +} + +bool HexagonDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, + SDValue &R2) { + if (Addr.getOpcode() == ISD::FrameIndex) return false; + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // Direct calls. + + if (Addr.getOpcode() == ISD::ADD) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) + if (isInt<13>(CN->getSExtValue())) + return false; // Let the reg+imm pattern catch this! + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + return true; + } + + R1 = Addr; + + return true; +} + + +// Handle generic address case. It is accessed from inlined asm =m constraints, +// which could have any kind of pointer. +bool HexagonDAGToDAGISel::SelectAddr(SDNode *Op, SDValue Addr, + SDValue &Base, SDValue &Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // Direct calls. + + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + if (Addr.getOpcode() == ISD::ADD) { + Base = Addr.getOperand(0); + Offset = Addr.getOperand(1); + return true; + } + + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + + +bool HexagonDAGToDAGISel:: +SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, + std::vector<SDValue> &OutOps) { + SDValue Op0, Op1; + + switch (ConstraintCode) { + case 'o': // Offsetable. + case 'v': // Not offsetable. + default: return true; + case 'm': // Memory. + if (!SelectAddr(Op.getNode(), Op, Op0, Op1)) + return true; + break; + } + + OutOps.push_back(Op0); + OutOps.push_back(Op1); + return false; +} diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp new file mode 100644 index 0000000..0ac3cf0 --- /dev/null +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -0,0 +1,1505 @@ +//===-- HexagonISelLowering.cpp - Hexagon DAG Lowering Implementation -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements the interfaces that Hexagon uses to lower LLVM code +// into a selection DAG. +// +//===----------------------------------------------------------------------===// + +#include "HexagonISelLowering.h" +#include "HexagonTargetMachine.h" +#include "HexagonMachineFunctionInfo.h" +#include "HexagonTargetObjectFile.h" +#include "HexagonSubtarget.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/InlineAsm.h" +#include "llvm/GlobalVariable.h" +#include "llvm/GlobalAlias.h" +#include "llvm/Intrinsics.h" +#include "llvm/CallingConv.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "HexagonMachineFunctionInfo.h" +#include "llvm/Support/CommandLine.h" + +const unsigned Hexagon_MAX_RET_SIZE = 64; +using namespace llvm; + +static cl::opt<bool> +EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden, + cl::desc("Control jump table emission on Hexagon target")); + +int NumNamedVarArgParams = -1; + +// Implement calling convention for Hexagon. +static bool +CC_Hexagon(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool +CC_Hexagon32(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool +CC_Hexagon64(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool +RetCC_Hexagon(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool +RetCC_Hexagon32(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool +RetCC_Hexagon64(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool +CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + + // NumNamedVarArgParams can not be zero for a VarArg function. + assert ( (NumNamedVarArgParams > 0) && + "NumNamedVarArgParams is not bigger than zero."); + + if ( (int)ValNo < NumNamedVarArgParams ) { + // Deal with named arguments. + return CC_Hexagon(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State); + } + + // Deal with un-named arguments. + unsigned ofst; + if (ArgFlags.isByVal()) { + // If pass-by-value, the size allocated on stack is decided + // by ArgFlags.getByValSize(), not by the size of LocVT. 
+ assert ((ArgFlags.getByValSize() > 8) && + "ByValSize must be bigger than 8 bytes"); + ofst = State.AllocateStack(ArgFlags.getByValSize(), 4); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::i32) { + ofst = State.AllocateStack(4, 4); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::i64) { + ofst = State.AllocateStack(8, 8); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + llvm_unreachable(0); + + return true; +} + + +static bool +CC_Hexagon (unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + + if (ArgFlags.isByVal()) { + // Passed on stack. + assert ((ArgFlags.getByValSize() > 8) && + "ByValSize must be bigger than 8 bytes"); + unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), 4); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + + if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) { + LocVT = MVT::i32; + ValVT = MVT::i32; + if (ArgFlags.isSExt()) + LocInfo = CCValAssign::SExt; + else if (ArgFlags.isZExt()) + LocInfo = CCValAssign::ZExt; + else + LocInfo = CCValAssign::AExt; + } + + if (LocVT == MVT::i32) { + if (!CC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) + return false; + } + + if (LocVT == MVT::i64) { + if (!CC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) + return false; + } + + return true; // CC didn't match. +} + + +static bool CC_Hexagon32(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + + static const unsigned RegList[] = { + Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, + Hexagon::R5 + }; + if (unsigned Reg = State.AllocateReg(RegList, 6)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + + unsigned Offset = State.AllocateStack(4, 4); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; +} + +static bool CC_Hexagon64(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + + if (unsigned Reg = State.AllocateReg(Hexagon::D0)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + + static const unsigned RegList1[] = { + Hexagon::D1, Hexagon::D2 + }; + static const unsigned RegList2[] = { + Hexagon::R1, Hexagon::R3 + }; + if (unsigned Reg = State.AllocateReg(RegList1, RegList2, 2)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + + unsigned Offset = State.AllocateStack(8, 8, Hexagon::D2); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; +} + +static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + + + if (LocVT == MVT::i1 || + LocVT == MVT::i8 || + LocVT == MVT::i16) { + LocVT = MVT::i32; + ValVT = MVT::i32; + if (ArgFlags.isSExt()) + LocInfo = CCValAssign::SExt; + else if (ArgFlags.isZExt()) + LocInfo = CCValAssign::ZExt; + else + LocInfo = CCValAssign::AExt; + } + + if (LocVT == MVT::i32) { + if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) + return false; + } + + if (LocVT == MVT::i64) { + if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) + return false; + } 
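CC_Hexagon32 above hands the first six word-sized arguments to R0 through R5 and spills the remainder to 4-byte-aligned stack slots. A small standalone sketch of that assignment for an all-32-bit argument list; the register list and slot size come from the code above, the starting stack offset is a simplification:

// Standalone sketch, not part of the patch: where CC_Hexagon32 places a
// call with eight 32-bit arguments. Purely illustrative of the rule above.
#include <cstdio>

int main() {
  const char *Regs[] = {"R0", "R1", "R2", "R3", "R4", "R5"};
  unsigned StackOffset = 0;
  for (unsigned Arg = 0; Arg != 8; ++Arg) {
    if (Arg < 6)
      std::printf("arg%u -> %s\n", Arg, Regs[Arg]);
    else {
      std::printf("arg%u -> [SP + %u]\n", Arg, StackOffset);
      StackOffset += 4;                 // mirrors State.AllocateStack(4, 4)
    }
  }
  return 0;
}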
+ + return true; // CC didn't match. +} + +static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + + if (LocVT == MVT::i32) { + if (unsigned Reg = State.AllocateReg(Hexagon::R0)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + unsigned Offset = State.AllocateStack(4, 4); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; +} + +static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + if (LocVT == MVT::i64) { + if (unsigned Reg = State.AllocateReg(Hexagon::D0)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + unsigned Offset = State.AllocateStack(8, 8); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; +} + +SDValue +HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) +const { + return SDValue(); +} + +/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified +/// by "Src" to address "Dst" of size "Size". Alignment information is +/// specified by the specific parameter attribute. The copy will be passed as +/// a byval function parameter. Sometimes what we are copying is the end of a +/// larger object, the part that does not fit in registers. +static SDValue +CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, + ISD::ArgFlagsTy Flags, SelectionDAG &DAG, + DebugLoc dl) { + + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); + return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), + /*isVolatile=*/false, /*AlwaysInline=*/false, + MachinePointerInfo(), MachinePointerInfo()); +} + + +// LowerReturn - Lower ISD::RET. If a struct is larger than 8 bytes and is +// passed by value, the function prototype is modified to return void and +// the value is stored in memory pointed by a pointer passed by caller. +SDValue +HexagonTargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const { + + // CCValAssign - represent the assignment of the return value to locations. + SmallVector<CCValAssign, 16> RVLocs; + + // CCState - Info about the registers and stack slot. + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + + // Analyze return values of ISD::RET + CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon); + + SDValue StackPtr = DAG.getRegister(TM.getRegisterInfo()->getStackRegister(), + MVT::i32); + + // If this is the first return lowered for this function, add the regs to the + // liveout set for the function. + if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { + for (unsigned i = 0; i != RVLocs.size(); ++i) + if (RVLocs[i].isRegLoc()) + DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); + } + + SDValue Flag; + // Copy the result values into the output registers. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + SDValue Ret = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag); + + // Guarantee that all emitted copies are stuck together with flags. 
+ Flag = Chain.getValue(1); + } + + if (Flag.getNode()) + return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, Chain, Flag); + + return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, Chain); +} + + + + +/// LowerCallResult - Lower the result values of an ISD::CALL into the +/// appropriate copies out of appropriate physical registers. This assumes that +/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call +/// being lowered. Returns a SDNode with the same number of values as the +/// ISD::CALL. +SDValue +HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const + SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals, + const SmallVectorImpl<SDValue> &OutVals, + SDValue Callee) const { + + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> RVLocs; + + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + Chain = DAG.getCopyFromReg(Chain, dl, + RVLocs[i].getLocReg(), + RVLocs[i].getValVT(), InFlag).getValue(1); + InFlag = Chain.getValue(2); + InVals.push_back(Chain.getValue(0)); + } + + return Chain; +} + +/// LowerCall - Functions arguments are copied from virtual regs to +/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. +SDValue +HexagonTargetLowering::LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + // Check for varargs. + NumNamedVarArgParams = -1; + if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) + { + const Function* CalleeFn = NULL; + Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, MVT::i32); + if ((CalleeFn = dyn_cast<Function>(GA->getGlobal()))) + { + // If a function has zero args and is a vararg function, that's + // disallowed so it must be an undeclared function. Do not assume + // varargs if the callee is undefined. + if (CalleeFn->isVarArg() && + CalleeFn->getFunctionType()->getNumParams() != 0) { + NumNamedVarArgParams = CalleeFn->getFunctionType()->getNumParams(); + } + } + } + + if (NumNamedVarArgParams > 0) + CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg); + else + CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon); + + + if(isTailCall) { + bool StructAttrFlag = + DAG.getMachineFunction().getFunction()->hasStructRetAttr(); + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, IsStructRet, + StructAttrFlag, + Outs, OutVals, Ins, DAG); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i){ + CCValAssign &VA = ArgLocs[i]; + if (VA.isMemLoc()) { + isTailCall = false; + break; + } + } + if (isTailCall) { + DEBUG(dbgs () << "Eligible for Tail Call\n"); + } else { + DEBUG(dbgs () << + "Argument must be passed on stack. 
Not eligible for Tail Call\n"); + } + } + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + + SDValue StackPtr = + DAG.getCopyFromReg(Chain, dl, TM.getRegisterInfo()->getStackRegister(), + getPointerTy()); + + // Walk the register/memloc assignments, inserting copies/loads. + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + + // Promote the value if needed. + switch (VA.getLocInfo()) { + default: + // Loc info must be one of Full, SExt, ZExt, or AExt. + assert(0 && "Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); + break; + } + + if (VA.isMemLoc()) { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getConstant(LocMemOffset, StackPtr.getValueType()); + PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff); + + if (Flags.isByVal()) { + // The argument is a struct passed by value. According to LLVM, "Arg" + // is is pointer. + MemOpChains.push_back(CreateCopyOfByValArgument(Arg, PtrOff, Chain, + Flags, DAG, dl)); + } else { + // The argument is not passed by value. "Arg" is a buildin type. It is + // not a pointer. + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(),false, false, + 0)); + } + continue; + } + + // Arguments that can be passed on register must be kept at RegsToPass + // vector. + if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } + } + + // Transform all store nodes into one single node because all store + // nodes are independent of each other. + if (!MemOpChains.empty()) { + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0], + MemOpChains.size()); + } + + if (!isTailCall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, + getPointerTy(), true)); + + // Build a sequence of copy-to-reg nodes chained together with token + // chain and flag operands which copy the outgoing args into registers. + // The InFlag in necessary since all emited instructions must be + // stuck together. + SDValue InFlag; + if (!isTailCall) { + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + } + + // For tail calls lower the arguments to the 'real' stack slot. + if (isTailCall) { + // Force all the incoming stack arguments to be loaded from the stack + // before any new outgoing arguments are stored to the stack, because the + // outgoing stack slots may alias the incoming argument stack slots, and + // the alias isn't otherwise explicit. This is slightly more conservative + // than necessary, because it means that each store effectively depends + // on every argument instead of just those arguments it would clobber. + // + // Do not flag preceeding copytoreg stuff together with the following stuff. 
+ InFlag = SDValue(); + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + InFlag =SDValue(); + } + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every + // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol + // node so that legalize doesn't hack it. + if (flag_aligned_memcpy) { + const char *MemcpyName = + "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes"; + Callee = + DAG.getTargetExternalSymbol(MemcpyName, getPointerTy()); + flag_aligned_memcpy = false; + } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy()); + } else if (ExternalSymbolSDNode *S = + dyn_cast<ExternalSymbolSDNode>(Callee)) { + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy()); + } + + // Returns a chain & a flag for retval copy to use. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SmallVector<SDValue, 8> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add argument registers to the end of the list so that they are + // known live into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + } + + if (InFlag.getNode()) { + Ops.push_back(InFlag); + } + + if (isTailCall) + return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + + Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + // Create the CALLSEQ_END node. + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), InFlag); + InFlag = Chain.getValue(1); + + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, + InVals, OutVals, Callee); +} + +static bool getIndexedAddressParts(SDNode *Ptr, EVT VT, + bool isSEXTLoad, SDValue &Base, + SDValue &Offset, bool &isInc, + SelectionDAG &DAG) { + if (Ptr->getOpcode() != ISD::ADD) + return false; + + if (VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) { + isInc = (Ptr->getOpcode() == ISD::ADD); + Base = Ptr->getOperand(0); + Offset = Ptr->getOperand(1); + // Ensure that Offset is a constant. + return (isa<ConstantSDNode>(Offset)); + } + + return false; +} + +// TODO: Put this function along with the other isS* functions in +// HexagonISelDAGToDAG.cpp into a common file. Or better still, use the +// functions defined in HexagonImmediates.td. +static bool Is_PostInc_S4_Offset(SDNode * S, int ShiftAmount) { + ConstantSDNode *N = cast<ConstantSDNode>(S); + + // immS4 predicate - True if the immediate fits in a 4-bit sign extended. + // field. + int64_t v = (int64_t)N->getSExtValue(); + int64_t m = 0; + if (ShiftAmount > 0) { + m = v % ShiftAmount; + v = v >> ShiftAmount; + } + return (v <= 7) && (v >= -8) && (m == 0); +} + +/// getPostIndexedAddressParts - returns true by value, base pointer and +/// offset pointer and addressing mode by reference if this node can be +/// combined with a load / store to form a post-indexed load / store. 
+bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, + SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const +{ + EVT VT; + SDValue Ptr; + bool isSEXTLoad = false; + + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + VT = LD->getMemoryVT(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + VT = ST->getMemoryVT(); + if (ST->getValue().getValueType() == MVT::i64 && ST->isTruncatingStore()) { + return false; + } + } else { + return false; + } + + bool isInc; + bool isLegal = getIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + // ShiftAmount = number of left-shifted bits in the Hexagon instruction. + int ShiftAmount = VT.getSizeInBits() / 16; + if (isLegal && Is_PostInc_S4_Offset(Offset.getNode(), ShiftAmount)) { + AM = isInc ? ISD::POST_INC : ISD::POST_DEC; + return true; + } + + return false; +} + +SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op, + SelectionDAG &DAG) const { + SDNode *Node = Op.getNode(); + MachineFunction &MF = DAG.getMachineFunction(); + HexagonMachineFunctionInfo *FuncInfo = + MF.getInfo<HexagonMachineFunctionInfo>(); + switch (Node->getOpcode()) { + case ISD::INLINEASM: { + unsigned NumOps = Node->getNumOperands(); + if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue) + --NumOps; // Ignore the flag operand. + + for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { + if (FuncInfo->hasClobberLR()) + break; + unsigned Flags = + cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue(); + unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); + ++i; // Skip the ID value. + + switch (InlineAsm::getKind(Flags)) { + default: llvm_unreachable("Bad flags!"); + case InlineAsm::Kind_RegDef: + case InlineAsm::Kind_RegUse: + case InlineAsm::Kind_Imm: + case InlineAsm::Kind_Clobber: + case InlineAsm::Kind_Mem: { + for (; NumVals; --NumVals, ++i) {} + break; + } + case InlineAsm::Kind_RegDefEarlyClobber: { + for (; NumVals; --NumVals, ++i) { + unsigned Reg = + cast<RegisterSDNode>(Node->getOperand(i))->getReg(); + + // Check it to be lr + if (Reg == TM.getRegisterInfo()->getRARegister()) { + FuncInfo->setHasClobberLR(true); + break; + } + } + break; + } + } + } + } + } // Node->getOpcode + return Op; +} + + +// +// Taken from the XCore backend. +// +SDValue HexagonTargetLowering:: +LowerBR_JT(SDValue Op, SelectionDAG &DAG) const +{ + SDValue Chain = Op.getOperand(0); + SDValue Table = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); + unsigned JTI = JT->getIndex(); + MachineFunction &MF = DAG.getMachineFunction(); + const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); + SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32); + + // Mark all jump table targets as address taken. + const std::vector<MachineJumpTableEntry> &JTE = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JTE[JTI].MBBs; + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + MBB->setHasAddressTaken(); + // This line is needed to set the hasAddressTaken flag on the BasicBlock + // object. 
+ BlockAddress::get(const_cast<BasicBlock *>(MBB->getBasicBlock())); + } + + SDValue JumpTableBase = DAG.getNode(HexagonISD::WrapperJT, dl, + getPointerTy(), TargetJT); + SDValue ShiftIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index, + DAG.getConstant(2, MVT::i32)); + SDValue JTAddress = DAG.getNode(ISD::ADD, dl, MVT::i32, JumpTableBase, + ShiftIndex); + SDValue LoadTarget = DAG.getLoad(MVT::i32, dl, Chain, JTAddress, + MachinePointerInfo(), false, false, false, + 0); + return DAG.getNode(HexagonISD::BR_JT, dl, MVT::Other, Chain, LoadTarget); +} + + +SDValue +HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + + unsigned SPReg = getStackPointerRegisterToSaveRestore(); + + // Get a reference to the stack pointer. + SDValue StackPointer = DAG.getCopyFromReg(Chain, dl, SPReg, MVT::i32); + + // Subtract the dynamic size from the actual stack size to + // obtain the new stack size. + SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, StackPointer, Size); + + // + // For Hexagon, the outgoing memory arguments area should be on top of the + // alloca area on the stack i.e., the outgoing memory arguments should be + // at a lower address than the alloca area. Move the alloca area down the + // stack by adding back the space reserved for outgoing arguments to SP + // here. + // + // We do not know what the size of the outgoing args is at this point. + // So, we add a pseudo instruction ADJDYNALLOC that will adjust the + // stack pointer. We patch this instruction with the correct, known + // offset in emitPrologue(). + // + // Use a placeholder immediate (zero) for now. This will be patched up + // by emitPrologue(). + SDValue ArgAdjust = DAG.getNode(HexagonISD::ADJDYNALLOC, dl, + MVT::i32, + Sub, + DAG.getConstant(0, MVT::i32)); + + // The Sub result contains the new stack start address, so it + // must be placed in the stack pointer register. + SDValue CopyChain = DAG.getCopyToReg(Chain, dl, + TM.getRegisterInfo()->getStackRegister(), + Sub); + + SDValue Ops[2] = { ArgAdjust, CopyChain }; + return DAG.getMergeValues(Ops, 2, dl); +} + +SDValue +HexagonTargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const + SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) +const { + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + HexagonMachineFunctionInfo *FuncInfo = + MF.getInfo<HexagonMachineFunctionInfo>(); + + + // Assign locations to all of the incoming arguments. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon); + + // For LLVM, in the case when returning a struct by value (>8byte), + // the first argument is a pointer that points to the location on caller's + // stack where the return value will be stored. For Hexagon, the location on + // caller's stack is passed only when the struct size is smaller than (and + // equal to) 8 bytes. If not, no address will be passed into callee and + // callee return the result direclty through R0/R1. 
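LowerReturn and LowerFormalArguments split aggregate returns at 8 bytes: a struct that fits in 8 bytes can come back in registers, while anything larger is returned through memory provided by the caller. A standalone sketch of the two shapes at the source level; the register pairing in the comments is illustrative:

// Standalone sketch, not part of the patch: the 8-byte split described in
// the comments above. 'Small' fits in 8 bytes; 'Big' does not and is
// returned through caller-provided memory (sret).
#include <cstdio>

struct Small { int A, B; };          // 8 bytes: may return in a register pair
struct Big   { int A, B, C, D; };    // 16 bytes: returned via hidden pointer

Small makeSmall() { return {1, 2}; }
Big   makeBig()   { return {1, 2, 3, 4}; }

int main() {
  Small S = makeSmall();
  Big   B = makeBig();
  std::printf("%d %d\n", S.A + S.B, B.A + B.D);
  return 0;
}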
+ + SmallVector<SDValue, 4> MemOps; + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + ISD::ArgFlagsTy Flags = Ins[i].Flags; + unsigned ObjSize; + unsigned StackLocation; + int FI; + + if ( (VA.isRegLoc() && !Flags.isByVal()) + || (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() > 8)) { + // Arguments passed in registers + // 1. int, long long, ptr args that get allocated in register. + // 2. Large struct that gets an register to put its address in. + EVT RegVT = VA.getLocVT(); + if (RegVT == MVT::i8 || RegVT == MVT::i16 || RegVT == MVT::i32) { + unsigned VReg = + RegInfo.createVirtualRegister(Hexagon::IntRegsRegisterClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + } else if (RegVT == MVT::i64) { + unsigned VReg = + RegInfo.createVirtualRegister(Hexagon::DoubleRegsRegisterClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + } else { + assert (0); + } + } else if (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() <= 8) { + assert (0 && "ByValSize must be bigger than 8 bytes"); + } else { + // Sanity check. + assert(VA.isMemLoc()); + + if (Flags.isByVal()) { + // If it's a byval parameter, then we need to compute the + // "real" size, not the size of the pointer. + ObjSize = Flags.getByValSize(); + } else { + ObjSize = VA.getLocVT().getStoreSizeInBits() >> 3; + } + + StackLocation = HEXAGON_LRFP_SIZE + VA.getLocMemOffset(); + // Create the frame index object for this incoming parameter... + FI = MFI->CreateFixedObject(ObjSize, StackLocation, true); + + // Create the SelectionDAG nodes cordl, responding to a load + // from this parameter. + SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); + + if (Flags.isByVal()) { + // If it's a pass-by-value aggregate, then do not dereference the stack + // location. Instead, we should generate a reference to the stack + // location. + InVals.push_back(FIN); + } else { + InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, + MachinePointerInfo(), false, false, + false, 0)); + } + } + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOps[0], + MemOps.size()); + + if (isVarArg) { + // This will point to the next argument passed via stack. + int FrameIndex = MFI->CreateFixedObject(Hexagon_PointerSize, + HEXAGON_LRFP_SIZE + + CCInfo.getNextStackOffset(), + true); + FuncInfo->setVarArgsFrameIndex(FrameIndex); + } + + return Chain; +} + +SDValue +HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { + // VASTART stores the address of the VarArgsFrameIndex slot into the + // memory location argument. 
+ MachineFunction &MF = DAG.getMachineFunction(); + HexagonMachineFunctionInfo *QFI = MF.getInfo<HexagonMachineFunctionInfo>(); + SDValue Addr = DAG.getFrameIndex(QFI->getVarArgsFrameIndex(), MVT::i32); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), Op.getDebugLoc(), Addr, + Op.getOperand(1), MachinePointerInfo(SV), false, + false, 0); +} + +SDValue +HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + SDNode* OpNode = Op.getNode(); + + SDValue Cond = DAG.getNode(ISD::SETCC, Op.getDebugLoc(), MVT::i1, + Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); + return DAG.getNode(ISD::SELECT, Op.getDebugLoc(), OpNode->getValueType(0), + Cond, Op.getOperand(0), + Op.getOperand(1)); +} + +SDValue +HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { + const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + if (Depth) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = DAG.getConstant(4, MVT::i32); + return DAG.getLoad(VT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), + MachinePointerInfo(), false, false, false, 0); + } + + // Return LR, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(TRI->getRARegister(), getRegClassFor(MVT::i32)); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); +} + +SDValue +HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { + const HexagonRegisterInfo *TRI = TM.getRegisterInfo(); + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, + TRI->getFrameRegister(), VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), + false, false, false, 0); + return FrameAddr; +} + + +SDValue HexagonTargetLowering::LowerMEMBARRIER(SDValue Op, + SelectionDAG& DAG) const { + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other, Op.getOperand(0)); +} + + +SDValue HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op, + SelectionDAG& DAG) const { + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other, Op.getOperand(0)); +} + + +SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, + SelectionDAG &DAG) const { + SDValue Result; + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); + DebugLoc dl = Op.getDebugLoc(); + Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); + + HexagonTargetObjectFile &TLOF = + (HexagonTargetObjectFile&)getObjFileLowering(); + if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) { + return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), Result); + } + + return DAG.getNode(HexagonISD::CONST32, dl, getPointerTy(), Result); +} + +//===----------------------------------------------------------------------===// +// TargetLowering Implementation 
+//===----------------------------------------------------------------------===// + +HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine + &targetmachine) + : TargetLowering(targetmachine, new HexagonTargetObjectFile()), + TM(targetmachine) { + + // Set up the register classes. + addRegisterClass(MVT::i32, Hexagon::IntRegsRegisterClass); + addRegisterClass(MVT::i64, Hexagon::DoubleRegsRegisterClass); + + addRegisterClass(MVT::i1, Hexagon::PredRegsRegisterClass); + + computeRegisterProperties(); + + // Align loop entry + setPrefLoopAlignment(4); + + // Limits for inline expansion of memcpy/memmove + maxStoresPerMemcpy = 6; + maxStoresPerMemmove = 6; + + // + // Library calls for unsupported operations + // + setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); + + setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf"); + setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf"); + setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf"); + setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf"); + setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf"); + setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf"); + setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf"); + + setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi"); + setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi"); + setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti"); + + setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi"); + setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi"); + setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti"); + + setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf"); + setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi"); + setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti"); + setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi"); + setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti"); + + setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); + + setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3"); + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3"); + setOperationAction(ISD::SREM, MVT::i32, Expand); + + setLibcallName(RTLIB::SDIV_I64, "__hexagon_divdi3"); + setOperationAction(ISD::SDIV, MVT::i64, Expand); + setLibcallName(RTLIB::SREM_I64, "__hexagon_moddi3"); + setOperationAction(ISD::SREM, MVT::i64, Expand); + + setLibcallName(RTLIB::UDIV_I32, "__hexagon_udivsi3"); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + + setLibcallName(RTLIB::UDIV_I64, "__hexagon_udivdi3"); + setOperationAction(ISD::UDIV, MVT::i64, Expand); + + setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3"); + setOperationAction(ISD::UREM, MVT::i32, Expand); + + setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3"); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3"); + setOperationAction(ISD::FDIV, MVT::f32, Expand); + + setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3"); + setOperationAction(ISD::FDIV, MVT::f64, Expand); + + setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2"); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand); + + setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf"); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); + + setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3"); + setOperationAction(ISD::FADD, MVT::f64, Expand); + + setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); + 
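The constructor above routes unsupported floating-point and division operations to runtime helpers by pairing setLibcallName with an Expand action. The practical effect, sketched below, is that ordinary source-level operations become calls; the helper named in the comment is the one registered above for DIV_F64:

// Standalone sketch, not part of the patch: with ISD::FDIV marked Expand
// and RTLIB::DIV_F64 named "__hexagon_divdf3" above, a double division is
// expected to lower to a call to that helper instead of an instruction.
#include <cstdio>

double average(double Sum, double Count) {
  return Sum / Count;   // expected: call to __hexagon_divdf3 on this target
}

int main() {
  std::printf("%f\n", average(10.0, 4.0));
  return 0;
}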
setOperationAction(ISD::FADD, MVT::f32, Expand); + + setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); + setOperationAction(ISD::FADD, MVT::f32, Expand); + + setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2"); + setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); + + setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi"); + setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand); + + setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi"); + setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand); + + setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf"); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); + + setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2"); + setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); + + setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2"); + setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); + + setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2"); + setCondCodeAction(ISD::SETOGT, MVT::f32, Expand); + + setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2"); + setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); + + setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2"); + setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + + setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2"); + setCondCodeAction(ISD::SETOLT, MVT::f64, Expand); + + setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2"); + setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); + + setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3"); + setOperationAction(ISD::SREM, MVT::i32, Expand); + + setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3"); + setOperationAction(ISD::FMUL, MVT::f64, Expand); + + setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3"); + setOperationAction(ISD::MUL, MVT::f32, Expand); + + setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2"); + setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); + + setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2"); + + + setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3"); + setOperationAction(ISD::SUB, MVT::f64, Expand); + + setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3"); + setOperationAction(ISD::SUB, MVT::f32, Expand); + + setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2"); + setOperationAction(ISD::FP_ROUND, MVT::f64, Expand); + + setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2"); + setCondCodeAction(ISD::SETUO, MVT::f64, Expand); + + setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2"); + setCondCodeAction(ISD::SETO, MVT::f64, Expand); + + setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2"); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); + + setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2"); + setCondCodeAction(ISD::SETO, MVT::f32, Expand); + + setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2"); + setCondCodeAction(ISD::SETUO, MVT::f32, Expand); + + setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal); + setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal); + setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); + setIndexedLoadAction(ISD::POST_INC, MVT::i64, Legal); + + setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal); + setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal); + setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); + setIndexedStoreAction(ISD::POST_INC, MVT::i64, Legal); + + setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); + + // Turn FP extload into load/fextend. + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + // Hexagon has a i1 sign extending load. + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); + // Turn FP truncstore into trunc + store. 
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + // Custom legalize GlobalAddress nodes into CONST32. + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i8, Custom); + // Truncate action? + setOperationAction(ISD::TRUNCATE, MVT::i64, Expand); + + // Hexagon doesn't have sext_inreg, replace them with shl/sra. + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); + + // Hexagon has no REM or DIVREM operations. + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + + setOperationAction(ISD::BSWAP, MVT::i64, Expand); + + // Expand fp<->uint. + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); + + // Hexagon has no select or setcc: expand to SELECT_CC. + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::f64, Expand); + + // Lower SELECT_CC to SETCC and SELECT. + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); + // This is a workaround documented in DAGCombiner.cpp:2892 We don't + // support SELECT_CC on every type. + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + + setOperationAction(ISD::BR_CC, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + if (EmitJumpTables) { + setOperationAction(ISD::BR_JT, MVT::Other, Custom); + } else { + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + } + + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + + setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); + + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FREM , MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FREM , MVT::f32, Expand); + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + setOperationAction(ISD::CTTZ , MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ , MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::ROTL , MVT::i32, Expand); + setOperationAction(ISD::ROTR , MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FPOW , MVT::f64, Expand); + setOperationAction(ISD::FPOW , MVT::f32, Expand); + + setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); + + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + + setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); + setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); + setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); + setOperationAction(ISD::EHSELECTION, 
MVT::i32, Expand); + + setOperationAction(ISD::EH_RETURN, MVT::Other, Expand); + + if (TM.getSubtargetImpl()->isSubtargetV2()) { + setExceptionPointerRegister(Hexagon::R20); + setExceptionSelectorRegister(Hexagon::R21); + } else { + setExceptionPointerRegister(Hexagon::R0); + setExceptionSelectorRegister(Hexagon::R1); + } + + // VASTART needs to be custom lowered to use the VarArgsFrameIndex. + setOperationAction(ISD::VASTART , MVT::Other, Custom); + + // Use the default implementation. + setOperationAction(ISD::VAARG , MVT::Other, Expand); + setOperationAction(ISD::VACOPY , MVT::Other, Expand); + setOperationAction(ISD::VAEND , MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand); + + + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); + setOperationAction(ISD::INLINEASM , MVT::Other, Custom); + + setMinFunctionAlignment(2); + + // Needed for DYNAMIC_STACKALLOC expansion. + unsigned StackRegister = TM.getRegisterInfo()->getStackRegister(); + setStackPointerRegisterToSaveRestore(StackRegister); +} + + +const char* +HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return 0; + case HexagonISD::CONST32: return "HexagonISD::CONST32"; + case HexagonISD::ADJDYNALLOC: return "HexagonISD::ADJDYNALLOC"; + case HexagonISD::CMPICC: return "HexagonISD::CMPICC"; + case HexagonISD::CMPFCC: return "HexagonISD::CMPFCC"; + case HexagonISD::BRICC: return "HexagonISD::BRICC"; + case HexagonISD::BRFCC: return "HexagonISD::BRFCC"; + case HexagonISD::SELECT_ICC: return "HexagonISD::SELECT_ICC"; + case HexagonISD::SELECT_FCC: return "HexagonISD::SELECT_FCC"; + case HexagonISD::Hi: return "HexagonISD::Hi"; + case HexagonISD::Lo: return "HexagonISD::Lo"; + case HexagonISD::FTOI: return "HexagonISD::FTOI"; + case HexagonISD::ITOF: return "HexagonISD::ITOF"; + case HexagonISD::CALL: return "HexagonISD::CALL"; + case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; + case HexagonISD::BR_JT: return "HexagonISD::BR_JT"; + case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN"; + } +} + +bool +HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + EVT MTy1 = EVT::getEVT(Ty1); + EVT MTy2 = EVT::getEVT(Ty2); + if (!MTy1.isSimple() || !MTy2.isSimple()) { + return false; + } + return ((MTy1.getSimpleVT() == MVT::i64) && (MTy2.getSimpleVT() == MVT::i32)); +} + +bool HexagonTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (!VT1.isSimple() || !VT2.isSimple()) { + return false; + } + return ((VT1.getSimpleVT() == MVT::i64) && (VT2.getSimpleVT() == MVT::i32)); +} + +SDValue +HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: assert(0 && "Should not custom lower this!"); + // Frame & Return address. Currently unimplemented. 
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::GlobalTLSAddress: + assert(0 && "TLS not implemented for Hexagon."); + case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG); + case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG); + case ISD::GlobalAddress: return LowerGLOBALADDRESS(Op, DAG); + case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::BR_JT: return LowerBR_JT(Op, DAG); + + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INLINEASM: return LowerINLINEASM(Op, DAG); + + } +} + + + +//===----------------------------------------------------------------------===// +// Hexagon Scheduler Hooks +//===----------------------------------------------------------------------===// +MachineBasicBlock * +HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) +const { + switch (MI->getOpcode()) { + case Hexagon::ADJDYNALLOC: { + MachineFunction *MF = BB->getParent(); + HexagonMachineFunctionInfo *FuncInfo = + MF->getInfo<HexagonMachineFunctionInfo>(); + FuncInfo->addAllocaAdjustInst(MI); + return BB; + } + default: + assert(false && "Unexpected instr type to insert"); + } // switch + return NULL; +} + +//===----------------------------------------------------------------------===// +// Inline Assembly Support +//===----------------------------------------------------------------------===// + +std::pair<unsigned, const TargetRegisterClass*> +HexagonTargetLowering::getRegForInlineAsmConstraint(const + std::string &Constraint, + EVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': // R0-R31 + switch (VT.getSimpleVT().SimpleTy) { + default: + assert(0 && "getRegForInlineAsmConstraint Unhandled data type"); + case MVT::i32: + case MVT::i16: + case MVT::i8: + return std::make_pair(0U, Hexagon::IntRegsRegisterClass); + case MVT::i64: + return std::make_pair(0U, Hexagon::DoubleRegsRegisterClass); + } + default: + assert(0 && "Unknown asm register class"); + } + } + + return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); +} + +/// isLegalAddressingMode - Return true if the addressing mode represented by +/// AM is legal for this target, for a load/store of the specified type. +bool HexagonTargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + // Allows a signed-extended 11-bit immediate field. + if (AM.BaseOffs <= -(1LL << 13) || AM.BaseOffs >= (1LL << 13)-1) { + return false; + } + + // No global is ever allowed as a base. + if (AM.BaseGV) { + return false; + } + + int Scale = AM.Scale; + if (Scale < 0) Scale = -Scale; + switch (Scale) { + case 0: // No scale reg, "r+i", "r", or just "i". + break; + default: // No scaled addressing mode. + return false; + } + return true; +} + +/// isLegalICmpImmediate - Return true if the specified immediate is legal +/// icmp immediate, that is the target has icmp instructions which can compare +/// a register against the immediate without having to materialize the +/// immediate into a register. +bool HexagonTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + return Imm >= -512 && Imm <= 511; +} + +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. Targets which want to do tail call +/// optimization should implement this function. 
+bool HexagonTargetLowering::IsEligibleForTailCallOptimization( + SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const { + const Function *CallerF = DAG.getMachineFunction().getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + // *************************************************************************** + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. + // *************************************************************************** + + // If this is a tail call via a function pointer, then don't do it! + if (!(dyn_cast<GlobalAddressSDNode>(Callee)) + && !(dyn_cast<ExternalSymbolSDNode>(Callee))) { + return false; + } + + // Do not optimize if the calling conventions do not match. + if (!CCMatch) + return false; + + // Do not tail call optimize vararg calls. + if (isVarArg) + return false; + + // Also avoid tail call optimization if either caller or callee uses struct + // return semantics. + if (isCalleeStructRet || isCallerStructRet) + return false; + + // In addition to the cases above, we also disable Tail Call Optimization if + // the calling convention code that at least one outgoing argument needs to + // go on the stack. We cannot check that here because at this point that + // information is not available. + return true; +} diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h new file mode 100644 index 0000000..b327615 --- /dev/null +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -0,0 +1,162 @@ +//==-- HexagonISelLowering.h - Hexagon DAG Lowering Interface ----*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that Hexagon uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef Hexagon_ISELLOWERING_H +#define Hexagon_ISELLOWERING_H + +#include "llvm/Target/TargetLowering.h" +#include "llvm/CallingConv.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "Hexagon.h" + +namespace llvm { + namespace HexagonISD { + enum { + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + CONST32, + CONST32_GP, // For marking data present in GP. + SETCC, + ADJDYNALLOC, + ARGEXTEND, + + CMPICC, // Compare two GPR operands, set icc. + CMPFCC, // Compare two FP operands, set fcc. + BRICC, // Branch to dest on icc condition + BRFCC, // Branch to dest on fcc condition + SELECT_ICC, // Select between two values using the current ICC flags. + SELECT_FCC, // Select between two values using the current FCC flags. + + Hi, Lo, // Hi/Lo operations, typically on a global address. + + FTOI, // FP to Int within a FP register. + ITOF, // Int to FP within a FP register. + + CALL, // A call instruction. + RET_FLAG, // Return with a flag operand. + BR_JT, // Jump table. + BARRIER, // Memory barrier. + WrapperJT, + TC_RETURN + }; + } + + class HexagonTargetLowering : public TargetLowering { + int VarArgsFrameOffset; // Frame offset to start of varargs area. 
+ + bool CanReturnSmallStruct(const Function* CalleeFn, + unsigned& RetSize) const; + + public: + HexagonTargetMachine &TM; + explicit HexagonTargetLowering(HexagonTargetMachine &targetmachine); + + /// IsEligibleForTailCallOptimization - Check whether the call is eligible + /// for tail call optimization. Targets which want to do tail call + /// optimization should implement this function. + bool + IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const + SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; + + virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const; + virtual bool isTruncateFree(EVT VT1, EVT VT2) const; + + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + + virtual const char *getTargetNodeName(unsigned Opcode) const; + SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals, + const SmallVectorImpl<SDValue> &OutVals, + SDValue Callee) const; + + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const; + SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const; + + virtual MachineBasicBlock + *EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const; + + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + virtual EVT getSetCCResultType(EVT VT) const { + return MVT::i1; + } + + virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, + SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const; + + std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const; + + // Intrinsics + virtual SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const; + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. 
+ /// The type may be VoidTy, in which case only return true if the addressing + /// mode is legal for a load/store of any legal type. + /// TODO: Handle pre/postinc as well. + virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const; + + /// isLegalICmpImmediate - Return true if the specified immediate is legal + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. + virtual bool isLegalICmpImmediate(int64_t Imm) const; + }; +} // end namespace llvm + +#endif // Hexagon_ISELLOWERING_H diff --git a/lib/Target/Hexagon/HexagonImmediates.td b/lib/Target/Hexagon/HexagonImmediates.td new file mode 100644 index 0000000..1e3fcb8 --- /dev/null +++ b/lib/Target/Hexagon/HexagonImmediates.td @@ -0,0 +1,491 @@ +//=- HexagonImmediates.td - Hexagon immediate processing --*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illnois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// From IA64's InstrInfo file +def s32Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s16Imm : Operand<i32> { + let PrintMethod = "printHexagonImmOperand"; +} + +def s12Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s11Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s11_0Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s11_1Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s11_2Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s11_3Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s10Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s8Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s9Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s8Imm64 : Operand<i64> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s6Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s4Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s4_0Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s4_1Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def s4_2Imm : Operand<i32> { + // For now, we use a generic print function for all operands. 
+ let PrintMethod = "printHexagonImmOperand"; +} + +def s4_3Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u64Imm : Operand<i64> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u32Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u16Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u16_0Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u16_1Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u16_2Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u11_3Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u10Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u9Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u8Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u7Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u6Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u6_0Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u6_1Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u6_2Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u6_3Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u5Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u4Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u3Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def u2Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def n8Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +def m6Imm : Operand<i32> { + // For now, we use a generic print function for all operands. + let PrintMethod = "printHexagonImmOperand"; +} + +// +// Immediate predicates +// +def s32ImmPred : PatLeaf<(i32 imm), [{ + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. 
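  // Note: despite the copied "immS16" wording here and in several of the
  // predicates below, each predicate checks the width or alignment encoded in
  // its own name - isInt<32>(v) here, isInt<12>(v) for s12ImmPred,
  // isShiftedInt<11,2>(v) for s11_2ImmPred, and so on.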
+ int64_t v = (int64_t)N->getSExtValue(); + return isInt<32>(v); +}]>; + +def s32_24ImmPred : PatLeaf<(i32 imm), [{ + // s32_24ImmPred predicate - True if the immediate fits in a 32-bit sign + // extended field that is a multiple of 0x1000000. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<32,24>(v); +}]>; + +def s32_16s8ImmPred : PatLeaf<(i32 imm), [{ + // s32_16s8ImmPred predicate - True if the immediate fits in a 32-bit sign + // extended field that is a multiple of 0x10000. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<24,16>(v); +}]>; + +def s16ImmPred : PatLeaf<(i32 imm), [{ + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<16>(v); +}]>; + + +def s13ImmPred : PatLeaf<(i32 imm), [{ + // immS13 predicate - True if the immediate fits in a 13-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<13>(v); +}]>; + + +def s12ImmPred : PatLeaf<(i32 imm), [{ + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<12>(v); +}]>; + +def s11_0ImmPred : PatLeaf<(i32 imm), [{ + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<11>(v); +}]>; + + +def s11_1ImmPred : PatLeaf<(i32 imm), [{ + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<11,1>(v); +}]>; + + +def s11_2ImmPred : PatLeaf<(i32 imm), [{ + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<11,2>(v); +}]>; + + +def s11_3ImmPred : PatLeaf<(i32 imm), [{ + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<11,3>(v); +}]>; + + +def s10ImmPred : PatLeaf<(i32 imm), [{ + // s10ImmPred predicate - True if the immediate fits in a 10-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<10>(v); +}]>; + + +def s9ImmPred : PatLeaf<(i32 imm), [{ + // s9ImmPred predicate - True if the immediate fits in a 9-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<9>(v); +}]>; + + +def s8ImmPred : PatLeaf<(i32 imm), [{ + // s8ImmPred predicate - True if the immediate fits in a 8-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<8>(v); +}]>; + + +def s8Imm64Pred : PatLeaf<(i64 imm), [{ + // s8ImmPred predicate - True if the immediate fits in a 8-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<8>(v); +}]>; + + +def s6ImmPred : PatLeaf<(i32 imm), [{ + // s6ImmPred predicate - True if the immediate fits in a 6-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<6>(v); +}]>; + + +def s4_0ImmPred : PatLeaf<(i32 imm), [{ + // s4_0ImmPred predicate - True if the immediate fits in a 4-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<4>(v); +}]>; + + +def s4_1ImmPred : PatLeaf<(i32 imm), [{ + // s4_1ImmPred predicate - True if the immediate fits in a 4-bit sign extended + // field of 2. 
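  // Clarification: isShiftedInt<N,S>(v) from llvm/Support/MathExtras.h is
  // isInt<N+S>(v) && (v % (1 << S) == 0), i.e. v is 2^S times a signed N-bit
  // value.  This predicate therefore accepts even values in [-16, 14]; the
  // s4_2/s4_3 variants below accept multiples of 4 in [-32, 28] and multiples
  // of 8 in [-64, 56].  isShiftedUInt<N,S>, used by the u6_1, u6_2 and u6_3
  // predicates, is the unsigned counterpart.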
+ int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<4,1>(v); +}]>; + + +def s4_2ImmPred : PatLeaf<(i32 imm), [{ + // s4_2ImmPred predicate - True if the immediate fits in a 4-bit sign extended + // field that is a multiple of 4. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<4,2>(v); +}]>; + + +def s4_3ImmPred : PatLeaf<(i32 imm), [{ + // s4_3ImmPred predicate - True if the immediate fits in a 4-bit sign extended + // field that is a multiple of 8. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<4,3>(v); +}]>; + + +def u64ImmPred : PatLeaf<(i64 imm), [{ + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + // Adding "N ||" to supress gcc unused warning. + return (N || true); +}]>; + +def u32ImmPred : PatLeaf<(i32 imm), [{ + // immS16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<32>(v); +}]>; + +def u16ImmPred : PatLeaf<(i32 imm), [{ + // u16ImmPred predicate - True if the immediate fits in a 16-bit unsigned + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<16>(v); +}]>; + +def u16_s8ImmPred : PatLeaf<(i32 imm), [{ + // u16_s8ImmPred predicate - True if the immediate fits in a 16-bit sign + // extended s8 field. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedUInt<16,8>(v); +}]>; + +def u9ImmPred : PatLeaf<(i32 imm), [{ + // u9ImmPred predicate - True if the immediate fits in a 9-bit unsigned + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<9>(v); +}]>; + + +def u8ImmPred : PatLeaf<(i32 imm), [{ + // u8ImmPred predicate - True if the immediate fits in a 8-bit unsigned + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<8>(v); +}]>; + +def u7ImmPred : PatLeaf<(i32 imm), [{ + // u7ImmPred predicate - True if the immediate fits in a 8-bit unsigned + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<7>(v); +}]>; + + +def u6ImmPred : PatLeaf<(i32 imm), [{ + // u6ImmPred predicate - True if the immediate fits in a 6-bit unsigned + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<6>(v); +}]>; + +def u6_0ImmPred : PatLeaf<(i32 imm), [{ + // u6_0ImmPred predicate - True if the immediate fits in a 6-bit unsigned + // field. Same as u6ImmPred. + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<6>(v); +}]>; + +def u6_1ImmPred : PatLeaf<(i32 imm), [{ + // u6_1ImmPred predicate - True if the immediate fits in a 6-bit unsigned + // field that is 1 bit alinged - multiple of 2. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedUInt<6,1>(v); +}]>; + +def u6_2ImmPred : PatLeaf<(i32 imm), [{ + // u6_2ImmPred predicate - True if the immediate fits in a 6-bit unsigned + // field that is 2 bits alinged - multiple of 4. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedUInt<6,2>(v); +}]>; + +def u6_3ImmPred : PatLeaf<(i32 imm), [{ + // u6_3ImmPred predicate - True if the immediate fits in a 6-bit unsigned + // field that is 3 bits alinged - multiple of 8. + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedUInt<6,3>(v); +}]>; + +def u5ImmPred : PatLeaf<(i32 imm), [{ + // u5ImmPred predicate - True if the immediate fits in a 5-bit unsigned + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<5>(v); +}]>; + + +def u3ImmPred : PatLeaf<(i32 imm), [{ + // u3ImmPred predicate - True if the immediate fits in a 3-bit unsigned + // field. 
+ int64_t v = (int64_t)N->getSExtValue(); + return isUInt<3>(v); +}]>; + + +def u2ImmPred : PatLeaf<(i32 imm), [{ + // u2ImmPred predicate - True if the immediate fits in a 2-bit unsigned + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<2>(v); +}]>; + + +def u1ImmPred : PatLeaf<(i1 imm), [{ + // u1ImmPred predicate - True if the immediate fits in a 1-bit unsigned + // field. + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<1>(v); +}]>; + +def m6ImmPred : PatLeaf<(i32 imm), [{ + // m6ImmPred predicate - True if the immediate is negative and fits in + // a 6-bit negative number. + int64_t v = (int64_t)N->getSExtValue(); + return isInt<6>(v); +}]>; + +//InN means negative integers in [-(2^N - 1), 0] +def n8ImmPred : PatLeaf<(i32 imm), [{ + // n8ImmPred predicate - True if the immediate fits in a 8-bit unsigned + // field. + int64_t v = (int64_t)N->getSExtValue(); + return (-255 <= v && v <= 0); +}]>; diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td new file mode 100644 index 0000000..7e92776 --- /dev/null +++ b/lib/Target/Hexagon/HexagonInstrFormats.td @@ -0,0 +1,242 @@ +//==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr, + InstrItinClass itin> : Instruction { + field bits<32> Inst; + + let Namespace = "Hexagon"; + +/* Commented out for Hexagon + bits<2> op; + let Inst{31-30} = op; */ // Top two bits are the 'op' field + + dag OutOperandList = outs; + dag InOperandList = ins; + let AsmString = asmstr; + let Pattern = pattern; + let Constraints = cstr; + let Itinerary = itin; +} + +//----------------------------------------------------------------------------// +// Intruction Classes Definitions + +//----------------------------------------------------------------------------// + +// LD Instruction Class in V2/V3/V4. +// Definition of the instruction class NOT CHANGED. +class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", LD> { + bits<5> rd; + bits<5> rs; + bits<13> imm13; +} + +// LD Instruction Class in V2/V3/V4. +// Definition of the instruction class NOT CHANGED. +class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : InstHexagon<outs, ins, asmstr, pattern, cstr, LD> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<13> imm13; +} + +// ST Instruction Class in V2/V3 can take SLOT0 only. +// ST Instruction Class in V4 can take SLOT0 & SLOT1. +// Definition of the instruction class CHANGED from V2/V3 to V4. +class STInst<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", ST> { + bits<5> rd; + bits<5> rs; + bits<13> imm13; +} + +// ST Instruction Class in V2/V3 can take SLOT0 only. +// ST Instruction Class in V4 can take SLOT0 & SLOT1. +// Definition of the instruction class CHANGED from V2/V3 to V4. +class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : InstHexagon<outs, ins, asmstr, pattern, cstr, ST> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<13> imm13; +} + +// ALU32 Instruction Class in V2/V3/V4. +// Definition of the instruction class NOT CHANGED. 
+class ALU32Type<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", ALU32> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<16> imm16; + bits<16> imm16_2; +} + +// ALU64 Instruction Class in V2/V3. +// XTYPE Instruction Class in V4. +// Definition of the instruction class NOT CHANGED. +// Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4. +class ALU64Type<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", ALU64> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<16> imm16; + bits<16> imm16_2; +} + +// M Instruction Class in V2/V3. +// XTYPE Instruction Class in V4. +// Definition of the instruction class NOT CHANGED. +// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4. +class MInst<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", M> { + bits<5> rd; + bits<5> rs; + bits<5> rt; +} + +// M Instruction Class in V2/V3. +// XTYPE Instruction Class in V4. +// Definition of the instruction class NOT CHANGED. +// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4. +class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : InstHexagon<outs, ins, asmstr, pattern, cstr, M> { + bits<5> rd; + bits<5> rs; + bits<5> rt; +} + +// S Instruction Class in V2/V3. +// XTYPE Instruction Class in V4. +// Definition of the instruction class NOT CHANGED. +// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4. +class SInst<dag outs, dag ins, string asmstr, list<dag> pattern> +//: InstHexagon<outs, ins, asmstr, pattern, cstr, !if(V4T, XTYPE_V4, M)> { + : InstHexagon<outs, ins, asmstr, pattern, "", S> { +// : InstHexagon<outs, ins, asmstr, pattern, "", S> { + bits<5> rd; + bits<5> rs; + bits<5> rt; +} + +// S Instruction Class in V2/V3. +// XTYPE Instruction Class in V4. +// Definition of the instruction class NOT CHANGED. +// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4. +class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : InstHexagon<outs, ins, asmstr, pattern, cstr, S> { +// : InstHexagon<outs, ins, asmstr, pattern, cstr, S> { +// : InstHexagon<outs, ins, asmstr, pattern, cstr, !if(V4T, XTYPE_V4, S)> { + bits<5> rd; + bits<5> rs; + bits<5> rt; +} + +// J Instruction Class in V2/V3/V4. +// Definition of the instruction class NOT CHANGED. +class JType<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", J> { + bits<16> imm16; +} + +// JR Instruction Class in V2/V3/V4. +// Definition of the instruction class NOT CHANGED. +class JRType<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", JR> { + bits<5> rs; + bits<5> pu; // Predicate register +} + +// CR Instruction Class in V2/V3/V4. +// Definition of the instruction class NOT CHANGED. +class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", CR> { + bits<5> rs; + bits<10> imm10; +} + + +class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", PSEUDO>; + + +//----------------------------------------------------------------------------// +// Intruction Classes Definitions - +//----------------------------------------------------------------------------// + + +// +// ALU32 patterns +//. 
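// (Annotation, not part of the original patch: the _rr/_ri/_ir/_ii suffixes
// below follow the usual register/immediate operand naming - e.g. ALU32_rr
// takes two register operands, ALU32_ri a register and an immediate.  All of
// them simply specialize ALU32Type, with the immediate forms zeroing the
// unused rt register field.)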
+class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern> + : ALU32Type<outs, ins, asmstr, pattern> { +} + +class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern> + : ALU32Type<outs, ins, asmstr, pattern> { + let rt{0-4} = 0; +} + +class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern> + : ALU32Type<outs, ins, asmstr, pattern> { + let rt{0-4} = 0; +} + +class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern> + : ALU32Type<outs, ins, asmstr, pattern> { + let rt{0-4} = 0; +} + +// +// ALU64 patterns. +// +class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern> + : ALU64Type<outs, ins, asmstr, pattern> { +} + +// J Type Instructions. +class JInst<dag outs, dag ins, string asmstr, list<dag> pattern> + : JType<outs, ins, asmstr, pattern> { +} + +// JR type Instructions. +class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern> + : JRType<outs, ins, asmstr, pattern> { +} + + +// Post increment ST Instruction. +class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> + : STInstPost<outs, ins, asmstr, pattern, cstr> { + let rt{0-4} = 0; +} + +// Post increment LD Instruction. +class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> + : LDInstPost<outs, ins, asmstr, pattern, cstr> { + let rt{0-4} = 0; +} + +//===----------------------------------------------------------------------===// +// V4 Instruction Format Definitions + +//===----------------------------------------------------------------------===// + +include "HexagonInstrFormatsV4.td" + +//===----------------------------------------------------------------------===// +// V4 Instruction Format Definitions + +//===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td new file mode 100644 index 0000000..bd5e449 --- /dev/null +++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td @@ -0,0 +1,46 @@ +//==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon V4 instruction classes in TableGen format. +// +//===----------------------------------------------------------------------===// + +// +// NV type instructions. +// +class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", NV_V4> { + bits<5> rd; + bits<5> rs; + bits<13> imm13; +} + +// Definition of Post increment new value store. +class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : InstHexagon<outs, ins, asmstr, pattern, cstr, NV_V4> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<13> imm13; +} + +// Post increment ST Instruction. 
+class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : NVInstPost_V4<outs, ins, asmstr, pattern, cstr> { + let rt{0-4} = 0; +} + +class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", MEM_V4> { + bits<5> rd; + bits<5> rs; + bits<6> imm6; +} diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp new file mode 100644 index 0000000..69a50d7 --- /dev/null +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -0,0 +1,1459 @@ +//=- HexagonInstrInfo.cpp - Hexagon Instruction Information -------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Hexagon implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "HexagonRegisterInfo.h" +#include "HexagonInstrInfo.h" +#include "HexagonSubtarget.h" +#include "Hexagon.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#define GET_INSTRINFO_CTOR +#include "HexagonGenInstrInfo.inc" + +#include <iostream> + + +using namespace llvm; + +/// +/// Constants for Hexagon instructions. +/// +const int Hexagon_MEMW_OFFSET_MAX = 4095; +const int Hexagon_MEMW_OFFSET_MIN = 4096; +const int Hexagon_MEMD_OFFSET_MAX = 8191; +const int Hexagon_MEMD_OFFSET_MIN = 8192; +const int Hexagon_MEMH_OFFSET_MAX = 2047; +const int Hexagon_MEMH_OFFSET_MIN = 2048; +const int Hexagon_MEMB_OFFSET_MAX = 1023; +const int Hexagon_MEMB_OFFSET_MIN = 1024; +const int Hexagon_ADDI_OFFSET_MAX = 32767; +const int Hexagon_ADDI_OFFSET_MIN = 32768; +const int Hexagon_MEMD_AUTOINC_MAX = 56; +const int Hexagon_MEMD_AUTOINC_MIN = 64; +const int Hexagon_MEMW_AUTOINC_MAX = 28; +const int Hexagon_MEMW_AUTOINC_MIN = 32; +const int Hexagon_MEMH_AUTOINC_MAX = 14; +const int Hexagon_MEMH_AUTOINC_MIN = 16; +const int Hexagon_MEMB_AUTOINC_MAX = 7; +const int Hexagon_MEMB_AUTOINC_MIN = 8; + + + +HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST) + : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), + RI(ST, *this), Subtarget(ST) { +} + + +/// isLoadFromStackSlot - If the specified machine instruction is a direct +/// load from a stack slot, return the virtual or physical register number of +/// the destination along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than loading from the stack slot. 
+unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + + + switch (MI->getOpcode()) { + case Hexagon::LDriw: + case Hexagon::LDrid: + case Hexagon::LDrih: + case Hexagon::LDrib: + case Hexagon::LDriub: + if (MI->getOperand(2).isFI() && + MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { + FrameIndex = MI->getOperand(2).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + + default: + break; + } + + return 0; +} + + +/// isStoreToStackSlot - If the specified machine instruction is a direct +/// store to a stack slot, return the virtual or physical register number of +/// the source reg along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than storing to the stack slot. +unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + case Hexagon::STriw: + case Hexagon::STrid: + case Hexagon::STrih: + case Hexagon::STrib: + if (MI->getOperand(2).isFI() && + MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { + FrameIndex = MI->getOperand(2).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + + default: + break; + } + + return 0; +} + + +unsigned +HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const{ + + int BOpc = Hexagon::JMP; + int BccOpc = Hexagon::JMP_Pred; + + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + + int regPos = 0; + // Check if ReverseBranchCondition has asked to reverse this branch + // If we want to reverse the branch an odd number of times, we want + // JMP_PredNot. + if (!Cond.empty() && Cond[0].isImm() && Cond[0].getImm() == 0) { + BccOpc = Hexagon::JMP_PredNot; + regPos = 1; + } + + if (FBB == 0) { + if (Cond.empty()) { + // Due to a bug in TailMerging/CFG Optimization, we need to add a + // special case handling of a predicated jump followed by an + // unconditional jump. If not, Tail Merging and CFG Optimization go + // into an infinite loop. + MachineBasicBlock *NewTBB, *NewFBB; + SmallVector<MachineOperand, 4> Cond; + MachineInstr *Term = MBB.getFirstTerminator(); + if (isPredicated(Term) && !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond, + false)) { + MachineBasicBlock *NextBB = + llvm::next(MachineFunction::iterator(&MBB)); + if (NewTBB == NextBB) { + ReverseBranchCondition(Cond); + RemoveBranch(MBB); + return InsertBranch(MBB, TBB, 0, Cond, DL); + } + } + BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); + } else { + BuildMI(&MBB, DL, + get(BccOpc)).addReg(Cond[regPos].getReg()).addMBB(TBB); + } + return 1; + } + + BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[regPos].getReg()).addMBB(TBB); + BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); + + return 2; +} + + +bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + FBB = NULL; + + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + + // A basic block may looks like this: + // + // [ insn + // EH_LABEL + // insn + // insn + // insn + // EH_LABEL + // insn ] + // + // It has two succs but does not have a terminator + // Don't know how to handle it. 
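  // (Note: returning true from AnalyzeBranch means the branch could not be
  // analyzed, so the scan below conservatively gives up as soon as it finds
  // an EH_LABEL anywhere in the block.)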
+ do { + --I; + if (I->isEHLabel()) + return true; + } while (I != MBB.begin()); + + I = MBB.end(); + --I; + + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (LastInst->getOpcode() == Hexagon::JMP) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } + if (LastInst->getOpcode() == Hexagon::JMP_Pred) { + // Block ends with fall-through true condbranch. + TBB = LastInst->getOperand(1).getMBB(); + Cond.push_back(LastInst->getOperand(0)); + return false; + } + if (LastInst->getOpcode() == Hexagon::JMP_PredNot) { + // Block ends with fall-through false condbranch. + TBB = LastInst->getOperand(1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(0)); + Cond.push_back(LastInst->getOperand(0)); + return false; + } + // Otherwise, don't know what this is. + return true; + } + + // Get the instruction before it if it's a terminator. + MachineInstr *SecondLastInst = I; + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && + isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with Hexagon::BRCOND and Hexagon:JMP, handle it. + if (((SecondLastInst->getOpcode() == Hexagon::BRCOND) || + (SecondLastInst->getOpcode() == Hexagon::JMP_Pred)) && + LastInst->getOpcode() == Hexagon::JMP) { + TBB = SecondLastInst->getOperand(1).getMBB(); + Cond.push_back(SecondLastInst->getOperand(0)); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } + + // If the block ends with Hexagon::JMP_PredNot and Hexagon:JMP, handle it. + if ((SecondLastInst->getOpcode() == Hexagon::JMP_PredNot) && + LastInst->getOpcode() == Hexagon::JMP) { + TBB = SecondLastInst->getOperand(1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(0)); + Cond.push_back(SecondLastInst->getOperand(0)); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } + + // If the block ends with two Hexagon:JMPs, handle it. The second one is not + // executed, so remove it. + if (SecondLastInst->getOpcode() == Hexagon::JMP && + LastInst->getOpcode() == Hexagon::JMP) { + TBB = SecondLastInst->getOperand(0).getMBB(); + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return false; + } + + // Otherwise, can't handle this. + return true; +} + + +unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + int BOpc = Hexagon::JMP; + int BccOpc = Hexagon::JMP_Pred; + int BccOpcNot = Hexagon::JMP_PredNot; + + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) return 0; + --I; + if (I->getOpcode() != BOpc && I->getOpcode() != BccOpc && + I->getOpcode() != BccOpcNot) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (I->getOpcode() != BccOpc && I->getOpcode() != BccOpcNot) + return 1; + + // Remove the branch. 
+ I->eraseFromParent(); + return 2; +} + + +void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (Hexagon::IntRegsRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::TFR), DestReg).addReg(SrcReg); + return; + } + if (Hexagon::DoubleRegsRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::TFR_64), DestReg).addReg(SrcReg); + return; + } + if (Hexagon::PredRegsRegClass.contains(SrcReg, DestReg)) { + // Map Pd = Ps to Pd = or(Ps, Ps). + BuildMI(MBB, I, DL, get(Hexagon::OR_pp), + DestReg).addReg(SrcReg).addReg(SrcReg); + return; + } + if (Hexagon::DoubleRegsRegClass.contains(DestReg, SrcReg)) { + // We can have an overlap between single and double reg: r1:0 = r0. + if(SrcReg == RI.getSubReg(DestReg, Hexagon::subreg_loreg)) { + // r1:0 = r0 + BuildMI(MBB, I, DL, get(Hexagon::TFRI), (RI.getSubReg(DestReg, + Hexagon::subreg_hireg))).addImm(0); + } else { + // r1:0 = r1 or no overlap. + BuildMI(MBB, I, DL, get(Hexagon::TFR), (RI.getSubReg(DestReg, + Hexagon::subreg_loreg))).addReg(SrcReg); + BuildMI(MBB, I, DL, get(Hexagon::TFRI), (RI.getSubReg(DestReg, + Hexagon::subreg_hireg))).addImm(0); + } + return; + } + if (Hexagon::CRRegsRegClass.contains(DestReg, SrcReg)) { + BuildMI(MBB, I, DL, get(Hexagon::TFCR), DestReg).addReg(SrcReg); + return; + } + + assert (0 && "Unimplemented"); +} + + +void HexagonInstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + + DebugLoc DL = MBB.findDebugLoc(I); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + + MachineMemOperand *MMO = + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOStore, + MFI.getObjectSize(FI), + Align); + + if (Hexagon::IntRegsRegisterClass->hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Hexagon::STriw)) + .addFrameIndex(FI).addImm(0) + .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); + } else if (Hexagon::DoubleRegsRegisterClass->hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Hexagon::STrid)) + .addFrameIndex(FI).addImm(0) + .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); + } else if (Hexagon::PredRegsRegisterClass->hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Hexagon::STriw_pred)) + .addFrameIndex(FI).addImm(0) + .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); + } else { + assert(0 && "Unimplemented"); + } +} + + +void HexagonInstrInfo::storeRegToAddr( + MachineFunction &MF, unsigned SrcReg, + bool isKill, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + SmallVectorImpl<MachineInstr*> &NewMIs) const +{ + assert(0 && "Unimplemented"); + return; +} + + +void HexagonInstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL = MBB.findDebugLoc(I); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + + MachineMemOperand *MMO = + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), + Align); + + if (RC == 
Hexagon::IntRegsRegisterClass) { + BuildMI(MBB, I, DL, get(Hexagon::LDriw), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO); + } else if (RC == Hexagon::DoubleRegsRegisterClass) { + BuildMI(MBB, I, DL, get(Hexagon::LDrid), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO); + } else if (RC == Hexagon::PredRegsRegisterClass) { + BuildMI(MBB, I, DL, get(Hexagon::LDriw_pred), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO); + } else { + assert(0 && "Can't store this register to stack slot"); + } +} + + +void HexagonInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + assert(0 && "Unimplemented"); +} + + +MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, + int FI) const { + // Hexagon_TODO: Implement. + return(0); +} + + +unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const { + + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + const TargetRegisterClass *TRC; + if (VT == MVT::i1) { + TRC = Hexagon::PredRegsRegisterClass; + } else if (VT == MVT::i32) { + TRC = Hexagon::IntRegsRegisterClass; + } else if (VT == MVT::i64) { + TRC = Hexagon::DoubleRegsRegisterClass; + } else { + assert(0 && "Cannot handle this register class"); + } + + unsigned NewReg = RegInfo.createVirtualRegister(TRC); + return NewReg; +} + + +bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { + bool isPred = MI->getDesc().isPredicable(); + + if (!isPred) + return false; + + const int Opc = MI->getOpcode(); + + switch(Opc) { + case Hexagon::TFRI: + return isInt<12>(MI->getOperand(1).getImm()); + + case Hexagon::STrid: + case Hexagon::STrid_indexed: + return isShiftedUInt<6,3>(MI->getOperand(1).getImm()); + + case Hexagon::STriw: + case Hexagon::STriw_indexed: + case Hexagon::STriw_nv_V4: + return isShiftedUInt<6,2>(MI->getOperand(1).getImm()); + + case Hexagon::STrih: + case Hexagon::STrih_indexed: + case Hexagon::STrih_nv_V4: + return isShiftedUInt<6,1>(MI->getOperand(1).getImm()); + + case Hexagon::STrib: + case Hexagon::STrib_indexed: + case Hexagon::STrib_nv_V4: + return isUInt<6>(MI->getOperand(1).getImm()); + + case Hexagon::LDrid: + case Hexagon::LDrid_indexed: + return isShiftedUInt<6,3>(MI->getOperand(2).getImm()); + + case Hexagon::LDriw: + case Hexagon::LDriw_indexed: + return isShiftedUInt<6,2>(MI->getOperand(2).getImm()); + + case Hexagon::LDrih: + case Hexagon::LDriuh: + case Hexagon::LDrih_indexed: + case Hexagon::LDriuh_indexed: + return isShiftedUInt<6,1>(MI->getOperand(2).getImm()); + + case Hexagon::LDrib: + case Hexagon::LDriub: + case Hexagon::LDrib_indexed: + case Hexagon::LDriub_indexed: + return isUInt<6>(MI->getOperand(2).getImm()); + + case Hexagon::POST_LDrid: + return isShiftedInt<4,3>(MI->getOperand(3).getImm()); + + case Hexagon::POST_LDriw: + return isShiftedInt<4,2>(MI->getOperand(3).getImm()); + + case Hexagon::POST_LDrih: + case Hexagon::POST_LDriuh: + return isShiftedInt<4,1>(MI->getOperand(3).getImm()); + + case Hexagon::POST_LDrib: + case Hexagon::POST_LDriub: + return isInt<4>(MI->getOperand(3).getImm()); + + case Hexagon::STrib_imm_V4: + case Hexagon::STrih_imm_V4: + case Hexagon::STriw_imm_V4: + return (isUInt<6>(MI->getOperand(1).getImm()) && + isInt<6>(MI->getOperand(2).getImm())); + + case Hexagon::ADD_ri: + return isInt<8>(MI->getOperand(2).getImm()); + + case Hexagon::ASLH: + case Hexagon::ASRH: + case 
Hexagon::SXTB: + case Hexagon::SXTH: + case Hexagon::ZXTB: + case Hexagon::ZXTH: + return Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; + + case Hexagon::JMPR: + return false; + return true; + + default: + return true; + } + + return true; +} + + +int HexagonInstrInfo:: +getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const { + switch(Opc) { + case Hexagon::TFR: + return !invertPredicate ? Hexagon::TFR_cPt : + Hexagon::TFR_cNotPt; + case Hexagon::TFRI: + return !invertPredicate ? Hexagon::TFRI_cPt : + Hexagon::TFRI_cNotPt; + case Hexagon::JMP: + return !invertPredicate ? Hexagon::JMP_Pred : + Hexagon::JMP_PredNot; + case Hexagon::ADD_ri: + return !invertPredicate ? Hexagon::ADD_ri_cPt : + Hexagon::ADD_ri_cNotPt; + case Hexagon::ADD_rr: + return !invertPredicate ? Hexagon::ADD_rr_cPt : + Hexagon::ADD_rr_cNotPt; + case Hexagon::XOR_rr: + return !invertPredicate ? Hexagon::XOR_rr_cPt : + Hexagon::XOR_rr_cNotPt; + case Hexagon::AND_rr: + return !invertPredicate ? Hexagon::AND_rr_cPt : + Hexagon::AND_rr_cNotPt; + case Hexagon::OR_rr: + return !invertPredicate ? Hexagon::OR_rr_cPt : + Hexagon::OR_rr_cNotPt; + case Hexagon::SUB_rr: + return !invertPredicate ? Hexagon::SUB_rr_cPt : + Hexagon::SUB_rr_cNotPt; + case Hexagon::COMBINE_rr: + return !invertPredicate ? Hexagon::COMBINE_rr_cPt : + Hexagon::COMBINE_rr_cNotPt; + case Hexagon::ASLH: + return !invertPredicate ? Hexagon::ASLH_cPt_V4 : + Hexagon::ASLH_cNotPt_V4; + case Hexagon::ASRH: + return !invertPredicate ? Hexagon::ASRH_cPt_V4 : + Hexagon::ASRH_cNotPt_V4; + case Hexagon::SXTB: + return !invertPredicate ? Hexagon::SXTB_cPt_V4 : + Hexagon::SXTB_cNotPt_V4; + case Hexagon::SXTH: + return !invertPredicate ? Hexagon::SXTH_cPt_V4 : + Hexagon::SXTH_cNotPt_V4; + case Hexagon::ZXTB: + return !invertPredicate ? Hexagon::ZXTB_cPt_V4 : + Hexagon::ZXTB_cNotPt_V4; + case Hexagon::ZXTH: + return !invertPredicate ? Hexagon::ZXTH_cPt_V4 : + Hexagon::ZXTH_cNotPt_V4; + + case Hexagon::JMPR: + return !invertPredicate ? Hexagon::JMPR_cPt : + Hexagon::JMPR_cNotPt; + + // V4 indexed+scaled load. + case Hexagon::LDrid_indexed_V4: + return !invertPredicate ? Hexagon::LDrid_indexed_cPt_V4 : + Hexagon::LDrid_indexed_cNotPt_V4; + case Hexagon::LDrid_indexed_shl_V4: + return !invertPredicate ? Hexagon::LDrid_indexed_shl_cPt_V4 : + Hexagon::LDrid_indexed_shl_cNotPt_V4; + case Hexagon::LDrib_indexed_V4: + return !invertPredicate ? Hexagon::LDrib_indexed_cPt_V4 : + Hexagon::LDrib_indexed_cNotPt_V4; + case Hexagon::LDriub_indexed_V4: + return !invertPredicate ? Hexagon::LDriub_indexed_cPt_V4 : + Hexagon::LDriub_indexed_cNotPt_V4; + case Hexagon::LDriub_ae_indexed_V4: + return !invertPredicate ? Hexagon::LDriub_indexed_cPt_V4 : + Hexagon::LDriub_indexed_cNotPt_V4; + case Hexagon::LDrib_indexed_shl_V4: + return !invertPredicate ? Hexagon::LDrib_indexed_shl_cPt_V4 : + Hexagon::LDrib_indexed_shl_cNotPt_V4; + case Hexagon::LDriub_indexed_shl_V4: + return !invertPredicate ? Hexagon::LDriub_indexed_shl_cPt_V4 : + Hexagon::LDriub_indexed_shl_cNotPt_V4; + case Hexagon::LDriub_ae_indexed_shl_V4: + return !invertPredicate ? Hexagon::LDriub_indexed_shl_cPt_V4 : + Hexagon::LDriub_indexed_shl_cNotPt_V4; + case Hexagon::LDrih_indexed_V4: + return !invertPredicate ? Hexagon::LDrih_indexed_cPt_V4 : + Hexagon::LDrih_indexed_cNotPt_V4; + case Hexagon::LDriuh_indexed_V4: + return !invertPredicate ? Hexagon::LDriuh_indexed_cPt_V4 : + Hexagon::LDriuh_indexed_cNotPt_V4; + case Hexagon::LDriuh_ae_indexed_V4: + return !invertPredicate ? 
Hexagon::LDriuh_indexed_cPt_V4 : + Hexagon::LDriuh_indexed_cNotPt_V4; + case Hexagon::LDrih_indexed_shl_V4: + return !invertPredicate ? Hexagon::LDrih_indexed_shl_cPt_V4 : + Hexagon::LDrih_indexed_shl_cNotPt_V4; + case Hexagon::LDriuh_indexed_shl_V4: + return !invertPredicate ? Hexagon::LDriuh_indexed_shl_cPt_V4 : + Hexagon::LDriuh_indexed_shl_cNotPt_V4; + case Hexagon::LDriuh_ae_indexed_shl_V4: + return !invertPredicate ? Hexagon::LDriuh_indexed_shl_cPt_V4 : + Hexagon::LDriuh_indexed_shl_cNotPt_V4; + case Hexagon::LDriw_indexed_V4: + return !invertPredicate ? Hexagon::LDriw_indexed_cPt_V4 : + Hexagon::LDriw_indexed_cNotPt_V4; + case Hexagon::LDriw_indexed_shl_V4: + return !invertPredicate ? Hexagon::LDriw_indexed_shl_cPt_V4 : + Hexagon::LDriw_indexed_shl_cNotPt_V4; + // Byte. + case Hexagon::POST_STbri: + return !invertPredicate ? Hexagon::POST_STbri_cPt : + Hexagon::POST_STbri_cNotPt; + case Hexagon::STrib: + return !invertPredicate ? Hexagon::STrib_cPt : + Hexagon::STrib_cNotPt; + case Hexagon::STrib_indexed: + return !invertPredicate ? Hexagon::STrib_indexed_cPt : + Hexagon::STrib_indexed_cNotPt; + case Hexagon::STrib_imm_V4: + return !invertPredicate ? Hexagon::STrib_imm_cPt_V4 : + Hexagon::STrib_imm_cNotPt_V4; + case Hexagon::STrib_indexed_shl_V4: + return !invertPredicate ? Hexagon::STrib_indexed_shl_cPt_V4 : + Hexagon::STrib_indexed_shl_cNotPt_V4; + // Halfword. + case Hexagon::POST_SThri: + return !invertPredicate ? Hexagon::POST_SThri_cPt : + Hexagon::POST_SThri_cNotPt; + case Hexagon::STrih: + return !invertPredicate ? Hexagon::STrih_cPt : + Hexagon::STrih_cNotPt; + case Hexagon::STrih_indexed: + return !invertPredicate ? Hexagon::STrih_indexed_cPt : + Hexagon::STrih_indexed_cNotPt; + case Hexagon::STrih_imm_V4: + return !invertPredicate ? Hexagon::STrih_imm_cPt_V4 : + Hexagon::STrih_imm_cNotPt_V4; + case Hexagon::STrih_indexed_shl_V4: + return !invertPredicate ? Hexagon::STrih_indexed_shl_cPt_V4 : + Hexagon::STrih_indexed_shl_cNotPt_V4; + // Word. + case Hexagon::POST_STwri: + return !invertPredicate ? Hexagon::POST_STwri_cPt : + Hexagon::POST_STwri_cNotPt; + case Hexagon::STriw: + return !invertPredicate ? Hexagon::STriw_cPt : + Hexagon::STriw_cNotPt; + case Hexagon::STriw_indexed: + return !invertPredicate ? Hexagon::STriw_indexed_cPt : + Hexagon::STriw_indexed_cNotPt; + case Hexagon::STriw_indexed_shl_V4: + return !invertPredicate ? Hexagon::STriw_indexed_shl_cPt_V4 : + Hexagon::STriw_indexed_shl_cNotPt_V4; + case Hexagon::STriw_imm_V4: + return !invertPredicate ? Hexagon::STriw_imm_cPt_V4 : + Hexagon::STriw_imm_cNotPt_V4; + // Double word. + case Hexagon::POST_STdri: + return !invertPredicate ? Hexagon::POST_STdri_cPt : + Hexagon::POST_STdri_cNotPt; + case Hexagon::STrid: + return !invertPredicate ? Hexagon::STrid_cPt : + Hexagon::STrid_cNotPt; + case Hexagon::STrid_indexed: + return !invertPredicate ? Hexagon::STrid_indexed_cPt : + Hexagon::STrid_indexed_cNotPt; + case Hexagon::STrid_indexed_shl_V4: + return !invertPredicate ? Hexagon::STrid_indexed_shl_cPt_V4 : + Hexagon::STrid_indexed_shl_cNotPt_V4; + // Load. + case Hexagon::LDrid: + return !invertPredicate ? Hexagon::LDrid_cPt : + Hexagon::LDrid_cNotPt; + case Hexagon::LDriw: + return !invertPredicate ? Hexagon::LDriw_cPt : + Hexagon::LDriw_cNotPt; + case Hexagon::LDrih: + return !invertPredicate ? Hexagon::LDrih_cPt : + Hexagon::LDrih_cNotPt; + case Hexagon::LDriuh: + return !invertPredicate ? Hexagon::LDriuh_cPt : + Hexagon::LDriuh_cNotPt; + case Hexagon::LDrib: + return !invertPredicate ? 
Hexagon::LDrib_cPt : + Hexagon::LDrib_cNotPt; + case Hexagon::LDriub: + return !invertPredicate ? Hexagon::LDriub_cPt : + Hexagon::LDriub_cNotPt; + case Hexagon::LDriubit: + return !invertPredicate ? Hexagon::LDriub_cPt : + Hexagon::LDriub_cNotPt; + // Load Indexed. + case Hexagon::LDrid_indexed: + return !invertPredicate ? Hexagon::LDrid_indexed_cPt : + Hexagon::LDrid_indexed_cNotPt; + case Hexagon::LDriw_indexed: + return !invertPredicate ? Hexagon::LDriw_indexed_cPt : + Hexagon::LDriw_indexed_cNotPt; + case Hexagon::LDrih_indexed: + return !invertPredicate ? Hexagon::LDrih_indexed_cPt : + Hexagon::LDrih_indexed_cNotPt; + case Hexagon::LDriuh_indexed: + return !invertPredicate ? Hexagon::LDriuh_indexed_cPt : + Hexagon::LDriuh_indexed_cNotPt; + case Hexagon::LDrib_indexed: + return !invertPredicate ? Hexagon::LDrib_indexed_cPt : + Hexagon::LDrib_indexed_cNotPt; + case Hexagon::LDriub_indexed: + return !invertPredicate ? Hexagon::LDriub_indexed_cPt : + Hexagon::LDriub_indexed_cNotPt; + // Post Increment Load. + case Hexagon::POST_LDrid: + return !invertPredicate ? Hexagon::POST_LDrid_cPt : + Hexagon::POST_LDrid_cNotPt; + case Hexagon::POST_LDriw: + return !invertPredicate ? Hexagon::POST_LDriw_cPt : + Hexagon::POST_LDriw_cNotPt; + case Hexagon::POST_LDrih: + return !invertPredicate ? Hexagon::POST_LDrih_cPt : + Hexagon::POST_LDrih_cNotPt; + case Hexagon::POST_LDriuh: + return !invertPredicate ? Hexagon::POST_LDriuh_cPt : + Hexagon::POST_LDriuh_cNotPt; + case Hexagon::POST_LDrib: + return !invertPredicate ? Hexagon::POST_LDrib_cPt : + Hexagon::POST_LDrib_cNotPt; + case Hexagon::POST_LDriub: + return !invertPredicate ? Hexagon::POST_LDriub_cPt : + Hexagon::POST_LDriub_cNotPt; + // DEALLOC_RETURN. + case Hexagon::DEALLOC_RET_V4: + return !invertPredicate ? Hexagon::DEALLOC_RET_cPt_V4 : + Hexagon::DEALLOC_RET_cNotPt_V4; + default: + assert(false && "Unexpected predicable instruction"); + } +} + + +bool HexagonInstrInfo:: +PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Cond) const { + int Opc = MI->getOpcode(); + assert (isPredicable(MI) && "Expected predicable instruction"); + bool invertJump = (!Cond.empty() && Cond[0].isImm() && + (Cond[0].getImm() == 0)); + MI->setDesc(get(getMatchingCondBranchOpcode(Opc, invertJump))); + // + // This assumes that the predicate is always the first operand + // in the set of inputs. + // + MI->addOperand(MI->getOperand(MI->getNumOperands()-1)); + int oper; + for (oper = MI->getNumOperands() - 3; oper >= 0; --oper) { + MachineOperand MO = MI->getOperand(oper); + if ((MO.isReg() && !MO.isUse() && !MO.isImplicit())) { + break; + } + + if (MO.isReg()) { + MI->getOperand(oper+1).ChangeToRegister(MO.getReg(), MO.isDef(), + MO.isImplicit(), MO.isKill(), + MO.isDead(), MO.isUndef(), + MO.isDebug()); + } else if (MO.isImm()) { + MI->getOperand(oper+1).ChangeToImmediate(MO.getImm()); + } else { + assert(false && "Unexpected operand type"); + } + } + + int regPos = invertJump ? 
1 : 0; + MachineOperand PredMO = Cond[regPos]; + MI->getOperand(oper+1).ChangeToRegister(PredMO.getReg(), PredMO.isDef(), + PredMO.isImplicit(), PredMO.isKill(), + PredMO.isDead(), PredMO.isUndef(), + PredMO.isDebug()); + + return true; +} + + +bool +HexagonInstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + unsigned ExtraPredCycles, + const BranchProbability &Probability) const { + return true; +} + + +bool +HexagonInstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, + unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, + unsigned ExtraFCycles, + const BranchProbability &Probability) const { + return true; +} + + +bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + case Hexagon::TFR_cPt: + case Hexagon::TFR_cNotPt: + case Hexagon::TFRI_cPt: + case Hexagon::TFRI_cNotPt: + case Hexagon::TFR_cdnPt: + case Hexagon::TFR_cdnNotPt: + case Hexagon::TFRI_cdnPt: + case Hexagon::TFRI_cdnNotPt: + return true; + + case Hexagon::JMP_Pred: + case Hexagon::JMP_PredNot: + case Hexagon::BRCOND: + case Hexagon::JMP_PredPt: + case Hexagon::JMP_PredNotPt: + case Hexagon::JMP_PredPnt: + case Hexagon::JMP_PredNotPnt: + return true; + + case Hexagon::LDrid_indexed_cPt_V4 : + case Hexagon::LDrid_indexed_cdnPt_V4 : + case Hexagon::LDrid_indexed_cNotPt_V4 : + case Hexagon::LDrid_indexed_cdnNotPt_V4 : + case Hexagon::LDrid_indexed_shl_cPt_V4 : + case Hexagon::LDrid_indexed_shl_cdnPt_V4 : + case Hexagon::LDrid_indexed_shl_cNotPt_V4 : + case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDrib_indexed_cPt_V4 : + case Hexagon::LDrib_indexed_cdnPt_V4 : + case Hexagon::LDrib_indexed_cNotPt_V4 : + case Hexagon::LDrib_indexed_cdnNotPt_V4 : + case Hexagon::LDrib_indexed_shl_cPt_V4 : + case Hexagon::LDrib_indexed_shl_cdnPt_V4 : + case Hexagon::LDrib_indexed_shl_cNotPt_V4 : + case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriub_indexed_cPt_V4 : + case Hexagon::LDriub_indexed_cdnPt_V4 : + case Hexagon::LDriub_indexed_cNotPt_V4 : + case Hexagon::LDriub_indexed_cdnNotPt_V4 : + case Hexagon::LDriub_indexed_shl_cPt_V4 : + case Hexagon::LDriub_indexed_shl_cdnPt_V4 : + case Hexagon::LDriub_indexed_shl_cNotPt_V4 : + case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDrih_indexed_cPt_V4 : + case Hexagon::LDrih_indexed_cdnPt_V4 : + case Hexagon::LDrih_indexed_cNotPt_V4 : + case Hexagon::LDrih_indexed_cdnNotPt_V4 : + case Hexagon::LDrih_indexed_shl_cPt_V4 : + case Hexagon::LDrih_indexed_shl_cdnPt_V4 : + case Hexagon::LDrih_indexed_shl_cNotPt_V4 : + case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriuh_indexed_cPt_V4 : + case Hexagon::LDriuh_indexed_cdnPt_V4 : + case Hexagon::LDriuh_indexed_cNotPt_V4 : + case Hexagon::LDriuh_indexed_cdnNotPt_V4 : + case Hexagon::LDriuh_indexed_shl_cPt_V4 : + case Hexagon::LDriuh_indexed_shl_cdnPt_V4 : + case Hexagon::LDriuh_indexed_shl_cNotPt_V4 : + case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriw_indexed_cPt_V4 : + case Hexagon::LDriw_indexed_cdnPt_V4 : + case Hexagon::LDriw_indexed_cNotPt_V4 : + case Hexagon::LDriw_indexed_cdnNotPt_V4 : + case Hexagon::LDriw_indexed_shl_cPt_V4 : + case Hexagon::LDriw_indexed_shl_cdnPt_V4 : + case Hexagon::LDriw_indexed_shl_cNotPt_V4 : + case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 : + return true; + + case Hexagon::LDrid_cPt : + case Hexagon::LDrid_cNotPt : + case Hexagon::LDrid_indexed_cPt : + case Hexagon::LDrid_indexed_cNotPt : + case Hexagon::POST_LDrid_cPt : + case 
Hexagon::POST_LDrid_cNotPt : + case Hexagon::LDriw_cPt : + case Hexagon::LDriw_cNotPt : + case Hexagon::LDriw_indexed_cPt : + case Hexagon::LDriw_indexed_cNotPt : + case Hexagon::POST_LDriw_cPt : + case Hexagon::POST_LDriw_cNotPt : + case Hexagon::LDrih_cPt : + case Hexagon::LDrih_cNotPt : + case Hexagon::LDrih_indexed_cPt : + case Hexagon::LDrih_indexed_cNotPt : + case Hexagon::POST_LDrih_cPt : + case Hexagon::POST_LDrih_cNotPt : + case Hexagon::LDrib_cPt : + case Hexagon::LDrib_cNotPt : + case Hexagon::LDrib_indexed_cPt : + case Hexagon::LDrib_indexed_cNotPt : + case Hexagon::POST_LDrib_cPt : + case Hexagon::POST_LDrib_cNotPt : + case Hexagon::LDriuh_cPt : + case Hexagon::LDriuh_cNotPt : + case Hexagon::LDriuh_indexed_cPt : + case Hexagon::LDriuh_indexed_cNotPt : + case Hexagon::POST_LDriuh_cPt : + case Hexagon::POST_LDriuh_cNotPt : + case Hexagon::LDriub_cPt : + case Hexagon::LDriub_cNotPt : + case Hexagon::LDriub_indexed_cPt : + case Hexagon::LDriub_indexed_cNotPt : + case Hexagon::POST_LDriub_cPt : + case Hexagon::POST_LDriub_cNotPt : + return true; + + case Hexagon::LDrid_cdnPt : + case Hexagon::LDrid_cdnNotPt : + case Hexagon::LDrid_indexed_cdnPt : + case Hexagon::LDrid_indexed_cdnNotPt : + case Hexagon::POST_LDrid_cdnPt_V4 : + case Hexagon::POST_LDrid_cdnNotPt_V4 : + case Hexagon::LDriw_cdnPt : + case Hexagon::LDriw_cdnNotPt : + case Hexagon::LDriw_indexed_cdnPt : + case Hexagon::LDriw_indexed_cdnNotPt : + case Hexagon::POST_LDriw_cdnPt_V4 : + case Hexagon::POST_LDriw_cdnNotPt_V4 : + case Hexagon::LDrih_cdnPt : + case Hexagon::LDrih_cdnNotPt : + case Hexagon::LDrih_indexed_cdnPt : + case Hexagon::LDrih_indexed_cdnNotPt : + case Hexagon::POST_LDrih_cdnPt_V4 : + case Hexagon::POST_LDrih_cdnNotPt_V4 : + case Hexagon::LDrib_cdnPt : + case Hexagon::LDrib_cdnNotPt : + case Hexagon::LDrib_indexed_cdnPt : + case Hexagon::LDrib_indexed_cdnNotPt : + case Hexagon::POST_LDrib_cdnPt_V4 : + case Hexagon::POST_LDrib_cdnNotPt_V4 : + case Hexagon::LDriuh_cdnPt : + case Hexagon::LDriuh_cdnNotPt : + case Hexagon::LDriuh_indexed_cdnPt : + case Hexagon::LDriuh_indexed_cdnNotPt : + case Hexagon::POST_LDriuh_cdnPt_V4 : + case Hexagon::POST_LDriuh_cdnNotPt_V4 : + case Hexagon::LDriub_cdnPt : + case Hexagon::LDriub_cdnNotPt : + case Hexagon::LDriub_indexed_cdnPt : + case Hexagon::LDriub_indexed_cdnNotPt : + case Hexagon::POST_LDriub_cdnPt_V4 : + case Hexagon::POST_LDriub_cdnNotPt_V4 : + return true; + + case Hexagon::ADD_ri_cPt: + case Hexagon::ADD_ri_cNotPt: + case Hexagon::ADD_ri_cdnPt: + case Hexagon::ADD_ri_cdnNotPt: + case Hexagon::ADD_rr_cPt: + case Hexagon::ADD_rr_cNotPt: + case Hexagon::ADD_rr_cdnPt: + case Hexagon::ADD_rr_cdnNotPt: + case Hexagon::XOR_rr_cPt: + case Hexagon::XOR_rr_cNotPt: + case Hexagon::XOR_rr_cdnPt: + case Hexagon::XOR_rr_cdnNotPt: + case Hexagon::AND_rr_cPt: + case Hexagon::AND_rr_cNotPt: + case Hexagon::AND_rr_cdnPt: + case Hexagon::AND_rr_cdnNotPt: + case Hexagon::OR_rr_cPt: + case Hexagon::OR_rr_cNotPt: + case Hexagon::OR_rr_cdnPt: + case Hexagon::OR_rr_cdnNotPt: + case Hexagon::SUB_rr_cPt: + case Hexagon::SUB_rr_cNotPt: + case Hexagon::SUB_rr_cdnPt: + case Hexagon::SUB_rr_cdnNotPt: + case Hexagon::COMBINE_rr_cPt: + case Hexagon::COMBINE_rr_cNotPt: + case Hexagon::COMBINE_rr_cdnPt: + case Hexagon::COMBINE_rr_cdnNotPt: + return true; + + case Hexagon::ASLH_cPt_V4: + case Hexagon::ASLH_cNotPt_V4: + case Hexagon::ASRH_cPt_V4: + case Hexagon::ASRH_cNotPt_V4: + case Hexagon::SXTB_cPt_V4: + case Hexagon::SXTB_cNotPt_V4: + case Hexagon::SXTH_cPt_V4: + case Hexagon::SXTH_cNotPt_V4: 
+ case Hexagon::ZXTB_cPt_V4: + case Hexagon::ZXTB_cNotPt_V4: + case Hexagon::ZXTH_cPt_V4: + case Hexagon::ZXTH_cNotPt_V4: + return true; + + case Hexagon::ASLH_cdnPt_V4: + case Hexagon::ASLH_cdnNotPt_V4: + case Hexagon::ASRH_cdnPt_V4: + case Hexagon::ASRH_cdnNotPt_V4: + case Hexagon::SXTB_cdnPt_V4: + case Hexagon::SXTB_cdnNotPt_V4: + case Hexagon::SXTH_cdnPt_V4: + case Hexagon::SXTH_cdnNotPt_V4: + case Hexagon::ZXTB_cdnPt_V4: + case Hexagon::ZXTB_cdnNotPt_V4: + case Hexagon::ZXTH_cdnPt_V4: + case Hexagon::ZXTH_cdnNotPt_V4: + return true; + + default: + return false; + } +} + + +bool +HexagonInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) { + MachineOperand MO = MI->getOperand(oper); + if (MO.isReg() && MO.isDef()) { + const TargetRegisterClass* RC = RI.getMinimalPhysRegClass(MO.getReg()); + if (RC == Hexagon::PredRegsRegisterClass) { + Pred.push_back(MO); + return true; + } + } + } + return false; +} + + +bool +HexagonInstrInfo:: +SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const { + // TODO: Fix this + return false; +} + + +// +// We indicate that we want to reverse the branch by +// inserting a 0 at the beginning of the Cond vector. +// +bool HexagonInstrInfo:: +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + if (!Cond.empty() && Cond[0].isImm() && Cond[0].getImm() == 0) { + Cond.erase(Cond.begin()); + } else { + Cond.insert(Cond.begin(), MachineOperand::CreateImm(0)); + } + return false; +} + + +bool HexagonInstrInfo:: +isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumInstrs, + const BranchProbability &Probability) const { + return (NumInstrs <= 4); +} + +bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + case Hexagon::DEALLOC_RET_V4 : + case Hexagon::DEALLOC_RET_cPt_V4 : + case Hexagon::DEALLOC_RET_cNotPt_V4 : + case Hexagon::DEALLOC_RET_cdnPnt_V4 : + case Hexagon::DEALLOC_RET_cNotdnPnt_V4 : + case Hexagon::DEALLOC_RET_cdnPt_V4 : + case Hexagon::DEALLOC_RET_cNotdnPt_V4 : + return true; + } + return false; +} + + +bool HexagonInstrInfo:: +isValidOffset(const int Opcode, const int Offset) const { + // This function is to check whether the "Offset" is in the correct range of + // the given "Opcode". If "Offset" is not in the correct range, "ADD_ri" is + // inserted to calculate the final address. Due to this reason, the function + // assumes that the "Offset" has correct alignment. 
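The comment above spells out the contract of isValidOffset(): offsets are assumed to be correctly aligned, and when an offset does not fit the addressing range of the given opcode the caller is expected to compute the address with an ADD_ri first. A minimal, hypothetical sketch of such a caller is below; the function name, ScratchReg, and OpNum are illustrative only and not part of this patch. The switch that follows then encodes the per-opcode ranges.

#include "HexagonInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Illustration only: fold BaseReg+Offset into the memory operand when the
// offset is in range for this opcode; otherwise compute the address with an
// ADD_ri into a scratch register and fold a zero offset instead.
static void rewriteOffset(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator II,
                          const HexagonInstrInfo &TII, unsigned BaseReg,
                          int Offset, unsigned ScratchReg, unsigned OpNum) {
  MachineInstr &MI = *II;
  if (TII.isValidOffset(MI.getOpcode(), Offset)) {
    MI.getOperand(OpNum).ChangeToRegister(BaseReg, /*isDef=*/false);
    MI.getOperand(OpNum + 1).ChangeToImmediate(Offset);
    return;
  }
  BuildMI(MBB, II, MI.getDebugLoc(), TII.get(Hexagon::ADD_ri), ScratchReg)
      .addReg(BaseReg)
      .addImm(Offset);                    // ScratchReg = add(BaseReg, #Offset)
  MI.getOperand(OpNum).ChangeToRegister(ScratchReg, /*isDef=*/false);
  MI.getOperand(OpNum + 1).ChangeToImmediate(0);
}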
+ + switch(Opcode) { + + case Hexagon::LDriw: + case Hexagon::STriw: + case Hexagon::STriwt: + assert((Offset % 4 == 0) && "Offset has incorrect alignment"); + return (Offset >= Hexagon_MEMW_OFFSET_MIN) && + (Offset <= Hexagon_MEMW_OFFSET_MAX); + + case Hexagon::LDrid: + case Hexagon::STrid: + assert((Offset % 8 == 0) && "Offset has incorrect alignment"); + return (Offset >= Hexagon_MEMD_OFFSET_MIN) && + (Offset <= Hexagon_MEMD_OFFSET_MAX); + + case Hexagon::LDrih: + case Hexagon::LDriuh: + case Hexagon::STrih: + case Hexagon::LDrih_ae: + assert((Offset % 2 == 0) && "Offset has incorrect alignment"); + return (Offset >= Hexagon_MEMH_OFFSET_MIN) && + (Offset <= Hexagon_MEMH_OFFSET_MAX); + + case Hexagon::LDrib: + case Hexagon::STrib: + case Hexagon::LDriub: + case Hexagon::LDriubit: + case Hexagon::LDrib_ae: + case Hexagon::LDriub_ae: + return (Offset >= Hexagon_MEMB_OFFSET_MIN) && + (Offset <= Hexagon_MEMB_OFFSET_MAX); + + case Hexagon::ADD_ri: + case Hexagon::TFR_FI: + return (Offset >= Hexagon_ADDI_OFFSET_MIN) && + (Offset <= Hexagon_ADDI_OFFSET_MAX); + + case Hexagon::MEMw_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMw_ADDi_indexed_MEM_V4 : + case Hexagon::MEMw_SUBi_indexed_MEM_V4 : + case Hexagon::MEMw_ADDr_indexed_MEM_V4 : + case Hexagon::MEMw_SUBr_indexed_MEM_V4 : + case Hexagon::MEMw_ANDr_indexed_MEM_V4 : + case Hexagon::MEMw_ORr_indexed_MEM_V4 : + case Hexagon::MEMw_ADDSUBi_MEM_V4 : + case Hexagon::MEMw_ADDi_MEM_V4 : + case Hexagon::MEMw_SUBi_MEM_V4 : + case Hexagon::MEMw_ADDr_MEM_V4 : + case Hexagon::MEMw_SUBr_MEM_V4 : + case Hexagon::MEMw_ANDr_MEM_V4 : + case Hexagon::MEMw_ORr_MEM_V4 : + assert ((Offset % 4) == 0 && "MEMOPw offset is not aligned correctly." ); + return (0 <= Offset && Offset <= 255); + + case Hexagon::MEMh_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMh_ADDi_indexed_MEM_V4 : + case Hexagon::MEMh_SUBi_indexed_MEM_V4 : + case Hexagon::MEMh_ADDr_indexed_MEM_V4 : + case Hexagon::MEMh_SUBr_indexed_MEM_V4 : + case Hexagon::MEMh_ANDr_indexed_MEM_V4 : + case Hexagon::MEMh_ORr_indexed_MEM_V4 : + case Hexagon::MEMh_ADDSUBi_MEM_V4 : + case Hexagon::MEMh_ADDi_MEM_V4 : + case Hexagon::MEMh_SUBi_MEM_V4 : + case Hexagon::MEMh_ADDr_MEM_V4 : + case Hexagon::MEMh_SUBr_MEM_V4 : + case Hexagon::MEMh_ANDr_MEM_V4 : + case Hexagon::MEMh_ORr_MEM_V4 : + assert ((Offset % 2) == 0 && "MEMOPh offset is not aligned correctly." ); + return (0 <= Offset && Offset <= 127); + + case Hexagon::MEMb_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMb_ADDi_indexed_MEM_V4 : + case Hexagon::MEMb_SUBi_indexed_MEM_V4 : + case Hexagon::MEMb_ADDr_indexed_MEM_V4 : + case Hexagon::MEMb_SUBr_indexed_MEM_V4 : + case Hexagon::MEMb_ANDr_indexed_MEM_V4 : + case Hexagon::MEMb_ORr_indexed_MEM_V4 : + case Hexagon::MEMb_ADDSUBi_MEM_V4 : + case Hexagon::MEMb_ADDi_MEM_V4 : + case Hexagon::MEMb_SUBi_MEM_V4 : + case Hexagon::MEMb_ADDr_MEM_V4 : + case Hexagon::MEMb_SUBr_MEM_V4 : + case Hexagon::MEMb_ANDr_MEM_V4 : + case Hexagon::MEMb_ORr_MEM_V4 : + return (0 <= Offset && Offset <= 63); + + // LDri_pred and STriw_pred are pseudo operations, so it has to take offset of + // any size. Later pass knows how to handle it. + case Hexagon::STriw_pred: + case Hexagon::LDriw_pred: + return true; + + // INLINEASM is very special. + case Hexagon::INLINEASM: + return true; + } + + assert(0 && "No offset range is defined for this opcode. Please define it in \ + the above switch statement!"); +} + + +// +// Check if the Offset is a valid auto-inc imm by Load/Store Type. 
+// +bool HexagonInstrInfo:: +isValidAutoIncImm(const EVT VT, const int Offset) const { + + if (VT == MVT::i64) { + return (Offset >= Hexagon_MEMD_AUTOINC_MIN && + Offset <= Hexagon_MEMD_AUTOINC_MAX && + (Offset & 0x7) == 0); + } + if (VT == MVT::i32) { + return (Offset >= Hexagon_MEMW_AUTOINC_MIN && + Offset <= Hexagon_MEMW_AUTOINC_MAX && + (Offset & 0x3) == 0); + } + if (VT == MVT::i16) { + return (Offset >= Hexagon_MEMH_AUTOINC_MIN && + Offset <= Hexagon_MEMH_AUTOINC_MAX && + (Offset & 0x1) == 0); + } + if (VT == MVT::i8) { + return (Offset >= Hexagon_MEMB_AUTOINC_MIN && + Offset <= Hexagon_MEMB_AUTOINC_MAX); + } + + assert(0 && "Not an auto-inc opc!"); + + return false; +} + + +bool HexagonInstrInfo:: +isMemOp(const MachineInstr *MI) const { + switch (MI->getOpcode()) + { + case Hexagon::MEMw_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMw_ADDi_indexed_MEM_V4 : + case Hexagon::MEMw_SUBi_indexed_MEM_V4 : + case Hexagon::MEMw_ADDr_indexed_MEM_V4 : + case Hexagon::MEMw_SUBr_indexed_MEM_V4 : + case Hexagon::MEMw_ANDr_indexed_MEM_V4 : + case Hexagon::MEMw_ORr_indexed_MEM_V4 : + case Hexagon::MEMw_ADDSUBi_MEM_V4 : + case Hexagon::MEMw_ADDi_MEM_V4 : + case Hexagon::MEMw_SUBi_MEM_V4 : + case Hexagon::MEMw_ADDr_MEM_V4 : + case Hexagon::MEMw_SUBr_MEM_V4 : + case Hexagon::MEMw_ANDr_MEM_V4 : + case Hexagon::MEMw_ORr_MEM_V4 : + case Hexagon::MEMh_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMh_ADDi_indexed_MEM_V4 : + case Hexagon::MEMh_SUBi_indexed_MEM_V4 : + case Hexagon::MEMh_ADDr_indexed_MEM_V4 : + case Hexagon::MEMh_SUBr_indexed_MEM_V4 : + case Hexagon::MEMh_ANDr_indexed_MEM_V4 : + case Hexagon::MEMh_ORr_indexed_MEM_V4 : + case Hexagon::MEMh_ADDSUBi_MEM_V4 : + case Hexagon::MEMh_ADDi_MEM_V4 : + case Hexagon::MEMh_SUBi_MEM_V4 : + case Hexagon::MEMh_ADDr_MEM_V4 : + case Hexagon::MEMh_SUBr_MEM_V4 : + case Hexagon::MEMh_ANDr_MEM_V4 : + case Hexagon::MEMh_ORr_MEM_V4 : + case Hexagon::MEMb_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMb_ADDi_indexed_MEM_V4 : + case Hexagon::MEMb_SUBi_indexed_MEM_V4 : + case Hexagon::MEMb_ADDr_indexed_MEM_V4 : + case Hexagon::MEMb_SUBr_indexed_MEM_V4 : + case Hexagon::MEMb_ANDr_indexed_MEM_V4 : + case Hexagon::MEMb_ORr_indexed_MEM_V4 : + case Hexagon::MEMb_ADDSUBi_MEM_V4 : + case Hexagon::MEMb_ADDi_MEM_V4 : + case Hexagon::MEMb_SUBi_MEM_V4 : + case Hexagon::MEMb_ADDr_MEM_V4 : + case Hexagon::MEMb_SUBr_MEM_V4 : + case Hexagon::MEMb_ANDr_MEM_V4 : + case Hexagon::MEMb_ORr_MEM_V4 : + return true; + } + return false; +} + + +bool HexagonInstrInfo:: +isSpillPredRegOp(const MachineInstr *MI) const { + switch (MI->getOpcode()) + { + case Hexagon::STriw_pred : + case Hexagon::LDriw_pred : + return true; + } + return false; +} + + +bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const { + const HexagonRegisterInfo& QRI = getRegisterInfo(); + switch (MI->getOpcode()) + { + case Hexagon::ADD_ri_cPt: + case Hexagon::ADD_ri_cNotPt: + case Hexagon::ADD_rr_cPt: + case Hexagon::ADD_rr_cNotPt: + case Hexagon::XOR_rr_cPt: + case Hexagon::XOR_rr_cNotPt: + case Hexagon::AND_rr_cPt: + case Hexagon::AND_rr_cNotPt: + case Hexagon::OR_rr_cPt: + case Hexagon::OR_rr_cNotPt: + case Hexagon::SUB_rr_cPt: + case Hexagon::SUB_rr_cNotPt: + case Hexagon::COMBINE_rr_cPt: + case Hexagon::COMBINE_rr_cNotPt: + return true; + case Hexagon::ASLH_cPt_V4: + case Hexagon::ASLH_cNotPt_V4: + case Hexagon::ASRH_cPt_V4: + case Hexagon::ASRH_cNotPt_V4: + case Hexagon::SXTB_cPt_V4: + case Hexagon::SXTB_cNotPt_V4: + case Hexagon::SXTH_cPt_V4: + case Hexagon::SXTH_cNotPt_V4: + case Hexagon::ZXTB_cPt_V4: + 
case Hexagon::ZXTB_cNotPt_V4: + case Hexagon::ZXTH_cPt_V4: + case Hexagon::ZXTH_cNotPt_V4: + return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; + + default: + return false; + } + return false; +} + + +bool HexagonInstrInfo:: +isConditionalLoad (const MachineInstr* MI) const { + const HexagonRegisterInfo& QRI = getRegisterInfo(); + switch (MI->getOpcode()) + { + case Hexagon::LDrid_cPt : + case Hexagon::LDrid_cNotPt : + case Hexagon::LDrid_indexed_cPt : + case Hexagon::LDrid_indexed_cNotPt : + case Hexagon::LDriw_cPt : + case Hexagon::LDriw_cNotPt : + case Hexagon::LDriw_indexed_cPt : + case Hexagon::LDriw_indexed_cNotPt : + case Hexagon::LDrih_cPt : + case Hexagon::LDrih_cNotPt : + case Hexagon::LDrih_indexed_cPt : + case Hexagon::LDrih_indexed_cNotPt : + case Hexagon::LDrib_cPt : + case Hexagon::LDrib_cNotPt : + case Hexagon::LDrib_indexed_cPt : + case Hexagon::LDrib_indexed_cNotPt : + case Hexagon::LDriuh_cPt : + case Hexagon::LDriuh_cNotPt : + case Hexagon::LDriuh_indexed_cPt : + case Hexagon::LDriuh_indexed_cNotPt : + case Hexagon::LDriub_cPt : + case Hexagon::LDriub_cNotPt : + case Hexagon::LDriub_indexed_cPt : + case Hexagon::LDriub_indexed_cNotPt : + return true; + case Hexagon::POST_LDrid_cPt : + case Hexagon::POST_LDrid_cNotPt : + case Hexagon::POST_LDriw_cPt : + case Hexagon::POST_LDriw_cNotPt : + case Hexagon::POST_LDrih_cPt : + case Hexagon::POST_LDrih_cNotPt : + case Hexagon::POST_LDrib_cPt : + case Hexagon::POST_LDrib_cNotPt : + case Hexagon::POST_LDriuh_cPt : + case Hexagon::POST_LDriuh_cNotPt : + case Hexagon::POST_LDriub_cPt : + case Hexagon::POST_LDriub_cNotPt : + return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; + case Hexagon::LDrid_indexed_cPt_V4 : + case Hexagon::LDrid_indexed_cNotPt_V4 : + case Hexagon::LDrid_indexed_shl_cPt_V4 : + case Hexagon::LDrid_indexed_shl_cNotPt_V4 : + case Hexagon::LDrib_indexed_cPt_V4 : + case Hexagon::LDrib_indexed_cNotPt_V4 : + case Hexagon::LDrib_indexed_shl_cPt_V4 : + case Hexagon::LDrib_indexed_shl_cNotPt_V4 : + case Hexagon::LDriub_indexed_cPt_V4 : + case Hexagon::LDriub_indexed_cNotPt_V4 : + case Hexagon::LDriub_indexed_shl_cPt_V4 : + case Hexagon::LDriub_indexed_shl_cNotPt_V4 : + case Hexagon::LDrih_indexed_cPt_V4 : + case Hexagon::LDrih_indexed_cNotPt_V4 : + case Hexagon::LDrih_indexed_shl_cPt_V4 : + case Hexagon::LDrih_indexed_shl_cNotPt_V4 : + case Hexagon::LDriuh_indexed_cPt_V4 : + case Hexagon::LDriuh_indexed_cNotPt_V4 : + case Hexagon::LDriuh_indexed_shl_cPt_V4 : + case Hexagon::LDriuh_indexed_shl_cNotPt_V4 : + case Hexagon::LDriw_indexed_cPt_V4 : + case Hexagon::LDriw_indexed_cNotPt_V4 : + case Hexagon::LDriw_indexed_shl_cPt_V4 : + case Hexagon::LDriw_indexed_shl_cNotPt_V4 : + return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; + default: + return false; + } + return false; +} diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h new file mode 100644 index 0000000..d549c46 --- /dev/null +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -0,0 +1,166 @@ +//=- HexagonInstrInfo.h - Hexagon Instruction Information ---------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Hexagon implementation of the TargetInstrInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef HexagonINSTRUCTIONINFO_H +#define HexagonINSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "HexagonRegisterInfo.h" + + +#define GET_INSTRINFO_HEADER +#include "HexagonGenInstrInfo.inc" + +namespace llvm { + +class HexagonInstrInfo : public HexagonGenInstrInfo { + const HexagonRegisterInfo RI; + const HexagonSubtarget& Subtarget; +public: + explicit HexagonInstrInfo(HexagonSubtarget &ST); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + virtual const HexagonRegisterInfo &getRegisterInfo() const { return RI; } + + /// isLoadFromStackSlot - If the specified machine instruction is a direct + /// load from a stack slot, return the virtual or physical register number of + /// the destination along with the FrameIndex of the loaded stack slot. If + /// not, return 0. This predicate must return 0 if the instruction has + /// any side effects other than loading from the stack slot. + virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const; + + /// isStoreToStackSlot - If the specified machine instruction is a direct + /// store to a stack slot, return the virtual or physical register number of + /// the source reg along with the FrameIndex of the loaded stack slot. If + /// not, return 0. This predicate must return 0 if the instruction has + /// any side effects other than storing to the stack slot. + virtual unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const; + + + virtual bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const; + + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + + virtual void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + SmallVectorImpl<MachineInstr*> &NewMIs) const; + + virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + SmallVectorImpl<MachineInstr*> &NewMIs) const; + + virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const; + + virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr* LoadMI) const { + return 0; + } + + unsigned createVR(MachineFunction* 
MF, MVT VT) const; + + virtual bool isPredicable(MachineInstr *MI) const; + virtual bool + PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Cond) const; + + virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + unsigned ExtraPredCycles, + const BranchProbability &Probability) const; + + virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, + const BranchProbability &Probability) const; + + virtual bool isPredicated(const MachineInstr *MI) const; + virtual bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const; + virtual bool + SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const; + + virtual bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + + virtual bool + isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumCycles, + const BranchProbability &Probability) const; + + bool isValidOffset(const int Opcode, const int Offset) const; + bool isValidAutoIncImm(const EVT VT, const int Offset) const; + bool isMemOp(const MachineInstr *MI) const; + bool isSpillPredRegOp(const MachineInstr *MI) const; + bool isU6_3Immediate(const int value) const; + bool isU6_2Immediate(const int value) const; + bool isU6_1Immediate(const int value) const; + bool isU6_0Immediate(const int value) const; + bool isS4_3Immediate(const int value) const; + bool isS4_2Immediate(const int value) const; + bool isS4_1Immediate(const int value) const; + bool isS4_0Immediate(const int value) const; + bool isS12_Immediate(const int value) const; + bool isU6_Immediate(const int value) const; + bool isS8_Immediate(const int value) const; + bool isS6_Immediate(const int value) const; + + bool isConditionalALU32 (const MachineInstr* MI) const; + bool isConditionalLoad (const MachineInstr* MI) const; + bool isDeallocRet(const MachineInstr *MI) const; + +private: + int getMatchingCondBranchOpcode(int Opc, bool sense) const; + +}; + +} + +#endif diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td new file mode 100644 index 0000000..cc508b7 --- /dev/null +++ b/lib/Target/Hexagon/HexagonInstrInfo.td @@ -0,0 +1,3014 @@ +//==- HexagonInstrInfo.td - Target Description for Hexagon -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +include "HexagonInstrFormats.td" +include "HexagonImmediates.td" + +//===----------------------------------------------------------------------===// +// Hexagon Instruction Predicate Definitions. 
+//===----------------------------------------------------------------------===// +def HasV2T : Predicate<"Subtarget.hasV2TOps()">; +def HasV2TOnly : Predicate<"Subtarget.hasV2TOpsOnly()">; +def NoV2T : Predicate<"!Subtarget.hasV2TOps()">; +def HasV3T : Predicate<"Subtarget.hasV3TOps()">; +def HasV3TOnly : Predicate<"Subtarget.hasV3TOpsOnly()">; +def NoV3T : Predicate<"!Subtarget.hasV3TOps()">; +def HasV4T : Predicate<"Subtarget.hasV4TOps()">; +def NoV4T : Predicate<"!Subtarget.hasV4TOps()">; +def UseMEMOP : Predicate<"Subtarget.useMemOps()">; + +// Addressing modes. +def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>; +def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], []>; +def ADDRriS11_0 : ComplexPattern<i32, 2, "SelectADDRriS11_0", [frameindex], []>; +def ADDRriS11_1 : ComplexPattern<i32, 2, "SelectADDRriS11_1", [frameindex], []>; +def ADDRriS11_2 : ComplexPattern<i32, 2, "SelectADDRriS11_2", [frameindex], []>; +def ADDRriS11_3 : ComplexPattern<i32, 2, "SelectADDRriS11_3", [frameindex], []>; +def ADDRriU6_0 : ComplexPattern<i32, 2, "SelectADDRriU6_0", [frameindex], []>; +def ADDRriU6_1 : ComplexPattern<i32, 2, "SelectADDRriU6_1", [frameindex], []>; +def ADDRriU6_2 : ComplexPattern<i32, 2, "SelectADDRriU6_2", [frameindex], []>; + +// Address operands. +def MEMrr : Operand<i32> { + let PrintMethod = "printHexagonMEMrrOperand"; + let MIOperandInfo = (ops IntRegs, IntRegs); +} + +// Address operands +def MEMri : Operand<i32> { + let PrintMethod = "printHexagonMEMriOperand"; + let MIOperandInfo = (ops IntRegs, IntRegs); +} + +def MEMri_s11_2 : Operand<i32>, + ComplexPattern<i32, 2, "SelectMEMriS11_2", []> { + let PrintMethod = "printHexagonMEMriOperand"; + let MIOperandInfo = (ops IntRegs, s11Imm); +} + +def FrameIndex : Operand<i32> { + let PrintMethod = "printHexagonFrameIndexOperand"; + let MIOperandInfo = (ops IntRegs, s11Imm); +} + +let PrintMethod = "printGlobalOperand" in + def globaladdress : Operand<i32>; + +let PrintMethod = "printJumpTable" in + def jumptablebase : Operand<i32>; + +def brtarget : Operand<OtherVT>; +def calltarget : Operand<i32>; + +def bblabel : Operand<i32>; +def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf , [], "BasicBlockSDNode">; + +def symbolHi32 : Operand<i32> { + let PrintMethod = "printSymbolHi"; +} +def symbolLo32 : Operand<i32> { + let PrintMethod = "printSymbolLo"; +} + +// Multi-class for logical operators. +multiclass ALU32_rr_ri<string OpcStr, SDNode OpNode> { + def rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), + [(set IntRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + def ri : ALU32_ri<(outs IntRegs:$dst), (ins s10Imm:$b, IntRegs:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "(#$b, $c)")), + [(set IntRegs:$dst, (OpNode s10Imm:$b, IntRegs:$c))]>; +} + +// Multi-class for compare ops. 
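Before the compare multiclasses, a note on the addressing-mode naming above: the ADDRriS11_n complex patterns and s11_nImm operands appear to denote a signed 11-bit immediate scaled by 2^n (s11_3 for doubleword accesses, s11_2 for words, and so on). Under that assumption, the encoding check looks like this self-contained sketch, included purely as a reading aid:

#include <cstdint>

// True if V is representable as (signed 11-bit immediate) << Shift, i.e. the
// low Shift bits are zero and the shifted value fits in 11 signed bits.
static bool isShiftedS11(int64_t V, unsigned Shift) {
  if (V & ((int64_t(1) << Shift) - 1))
    return false;                   // must be naturally aligned
  V >>= Shift;
  return V >= -1024 && V <= 1023;   // signed 11-bit range
}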
+let isCompare = 1 in { +multiclass CMP64_rr<string OpcStr, PatFrag OpNode> { + def rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), + [(set PredRegs:$dst, (OpNode DoubleRegs:$b, DoubleRegs:$c))]>; +} +multiclass CMP32_rr<string OpcStr, PatFrag OpNode> { + def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), + [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; +} + +multiclass CMP32_rr_ri_s10<string OpcStr, PatFrag OpNode> { + def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), + [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s10Imm:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), + [(set PredRegs:$dst, (OpNode IntRegs:$b, s10ImmPred:$c))]>; +} + +multiclass CMP32_rr_ri_u9<string OpcStr, PatFrag OpNode> { + def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), + [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u9Imm:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), + [(set PredRegs:$dst, (OpNode IntRegs:$b, u9ImmPred:$c))]>; +} + +multiclass CMP32_ri_u9<string OpcStr, PatFrag OpNode> { + def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u9Imm:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), + [(set PredRegs:$dst, (OpNode IntRegs:$b, u9ImmPred:$c))]>; +} + +multiclass CMP32_ri_s8<string OpcStr, PatFrag OpNode> { + def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s8Imm:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), + [(set PredRegs:$dst, (OpNode IntRegs:$b, s8ImmPred:$c))]>; +} +} + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// http://qualnet.qualcomm.com/~erich/v1/htmldocs/index.html +// http://qualnet.qualcomm.com/~erich/v2/htmldocs/index.html +// http://qualnet.qualcomm.com/~erich/v3/htmldocs/index.html +// http://qualnet.qualcomm.com/~erich/v4/htmldocs/index.html +// http://qualnet.qualcomm.com/~erich/v5/htmldocs/index.html +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ALU32/ALU + +//===----------------------------------------------------------------------===// +// Add. +let isPredicable = 1 in +def ADD_rr : ALU32_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = add($src1, $src2)", + [(set IntRegs:$dst, (add IntRegs:$src1, IntRegs:$src2))]>; + +let isPredicable = 1 in +def ADD_ri : ALU32_ri<(outs IntRegs:$dst), + (ins IntRegs:$src1, s16Imm:$src2), + "$dst = add($src1, #$src2)", + [(set IntRegs:$dst, (add IntRegs:$src1, s16ImmPred:$src2))]>; + +// Logical operations. 
+let isPredicable = 1 in +def XOR_rr : ALU32_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = xor($src1, $src2)", + [(set IntRegs:$dst, (xor IntRegs:$src1, IntRegs:$src2))]>; + +let isPredicable = 1 in +def AND_rr : ALU32_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = and($src1, $src2)", + [(set IntRegs:$dst, (and IntRegs:$src1, IntRegs:$src2))]>; + +def OR_ri : ALU32_ri<(outs IntRegs:$dst), + (ins IntRegs:$src1, s8Imm:$src2), + "$dst = or($src1, #$src2)", + [(set IntRegs:$dst, (or IntRegs:$src1, s8ImmPred:$src2))]>; + +def NOT_rr : ALU32_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1), + "$dst = not($src1)", + [(set IntRegs:$dst, (not IntRegs:$src1))]>; + +def AND_ri : ALU32_ri<(outs IntRegs:$dst), + (ins IntRegs:$src1, s10Imm:$src2), + "$dst = and($src1, #$src2)", + [(set IntRegs:$dst, (and IntRegs:$src1, s10ImmPred:$src2))]>; + +let isPredicable = 1 in +def OR_rr : ALU32_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = or($src1, $src2)", + [(set IntRegs:$dst, (or IntRegs:$src1, IntRegs:$src2))]>; + +// Negate. +def NEG : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), + "$dst = neg($src1)", + [(set IntRegs:$dst, (ineg IntRegs:$src1))]>; +// Nop. +let neverHasSideEffects = 1 in +def NOP : ALU32_rr<(outs), (ins), + "nop", + []>; + +// Subtract. +let isPredicable = 1 in +def SUB_rr : ALU32_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = sub($src1, $src2)", + [(set IntRegs:$dst, (sub IntRegs:$src1, IntRegs:$src2))]>; + +// Transfer immediate. +let isReMaterializable = 1, isPredicable = 1 in +def TFRI : ALU32_ri<(outs IntRegs:$dst), (ins s16Imm:$src1), + "$dst = #$src1", + [(set IntRegs:$dst, s16ImmPred:$src1)]>; + +// Transfer register. +let neverHasSideEffects = 1, isPredicable = 1 in +def TFR : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1), + "$dst = $src1", + []>; + +// Transfer control register. +let neverHasSideEffects = 1 in +def TFCR : CRInst<(outs CRRegs:$dst), (ins IntRegs:$src1), + "$dst = $src1", + []>; +//===----------------------------------------------------------------------===// +// ALU32/ALU - +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// ALU32/PERM + +//===----------------------------------------------------------------------===// + +// Combine. +let isPredicable = 1, neverHasSideEffects = 1 in +def COMBINE_rr : ALU32_rr<(outs DoubleRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = combine($src1, $src2)", + []>; + +// Mux. 
+def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1, + DoubleRegs:$src2, + DoubleRegs:$src3), + "$dst = vmux($src1, $src2, $src3)", + []>; + +def MUX_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, + IntRegs:$src2, IntRegs:$src3), + "$dst = mux($src1, $src2, $src3)", + [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2, + IntRegs:$src3))]>; + +def MUX_ir : ALU32_ir<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Imm:$src2, + IntRegs:$src3), + "$dst = mux($src1, #$src2, $src3)", + [(set IntRegs:$dst, (select PredRegs:$src1, + s8ImmPred:$src2, IntRegs:$src3))]>; + +def MUX_ri : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, + s8Imm:$src3), + "$dst = mux($src1, $src2, #$src3)", + [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2, + s8ImmPred:$src3))]>; + +def MUX_ii : ALU32_ii<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Imm:$src2, + s8Imm:$src3), + "$dst = mux($src1, #$src2, #$src3)", + [(set IntRegs:$dst, (select PredRegs:$src1, s8ImmPred:$src2, + s8ImmPred:$src3))]>; + +// Shift halfword. +let isPredicable = 1 in +def ASLH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), + "$dst = aslh($src1)", + [(set IntRegs:$dst, (shl 16, IntRegs:$src1))]>; + +let isPredicable = 1 in +def ASRH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), + "$dst = asrh($src1)", + [(set IntRegs:$dst, (sra 16, IntRegs:$src1))]>; + +// Sign extend. +let isPredicable = 1 in +def SXTB : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), + "$dst = sxtb($src1)", + [(set IntRegs:$dst, (sext_inreg IntRegs:$src1, i8))]>; + +let isPredicable = 1 in +def SXTH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), + "$dst = sxth($src1)", + [(set IntRegs:$dst, (sext_inreg IntRegs:$src1, i16))]>; + +// Zero extend. +let isPredicable = 1, neverHasSideEffects = 1 in +def ZXTB : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), + "$dst = zxtb($src1)", + []>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def ZXTH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), + "$dst = zxth($src1)", + []>; +//===----------------------------------------------------------------------===// +// ALU32/PERM - +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// ALU32/PRED + +//===----------------------------------------------------------------------===// + +// Conditional add. 
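Before the conditional add forms that follow, the unpredicated ALU32/PERM operations above can be restated as plain C++ (a reading aid only, not backend code; note that the architectural aslh/asrh shift the register by 16, whereas the shl/sra selection patterns above list the constant as the first operand):

#include <cstdint>

// Scalar semantics of the ALU32/PERM instructions defined above.  Arithmetic
// right shift of negative values assumes the usual host behaviour.
static int32_t mux (bool p, int32_t a, int32_t b) { return p ? a : b; }  // mux(p, a, b)
static int32_t aslh(int32_t x) { return (int32_t)((uint32_t)x << 16); }  // shift left by a halfword
static int32_t asrh(int32_t x) { return x >> 16; }                       // arithmetic shift right by a halfword
static int32_t sxtb(int32_t x) { return (int8_t)x;   }                   // sign-extend byte
static int32_t sxth(int32_t x) { return (int16_t)x;  }                   // sign-extend halfword
static int32_t zxtb(int32_t x) { return (uint8_t)x;  }                   // zero-extend byte
static int32_t zxth(int32_t x) { return (uint16_t)x; }                   // zero-extend halfword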
+let neverHasSideEffects = 1 in +def ADD_ri_cPt : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + "if ($src1) $dst = add($src2, #$src3)", + []>; + +let neverHasSideEffects = 1 in +def ADD_ri_cNotPt : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + "if (!$src1) $dst = add($src2, #$src3)", + []>; + +let neverHasSideEffects = 1 in +def ADD_ri_cdnPt : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + "if ($src1.new) $dst = add($src2, #$src3)", + []>; + +let neverHasSideEffects = 1 in +def ADD_ri_cdnNotPt : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + "if (!$src1.new) $dst = add($src2, #$src3)", + []>; + +let neverHasSideEffects = 1 in +def ADD_rr_cPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst = add($src2, $src3)", + []>; + +let neverHasSideEffects = 1 in +def ADD_rr_cNotPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst = add($src2, $src3)", + []>; + +let neverHasSideEffects = 1 in +def ADD_rr_cdnPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst = add($src2, $src3)", + []>; + +let neverHasSideEffects = 1 in +def ADD_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst = add($src2, $src3)", + []>; + + +// Conditional combine. + +let neverHasSideEffects = 1 in +def COMBINE_rr_cPt : ALU32_rr<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst = combine($src2, $src3)", + []>; + +let neverHasSideEffects = 1 in +def COMBINE_rr_cNotPt : ALU32_rr<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst = combine($src2, $src3)", + []>; + +let neverHasSideEffects = 1 in +def COMBINE_rr_cdnPt : ALU32_rr<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst = combine($src2, $src3)", + []>; + +let neverHasSideEffects = 1 in +def COMBINE_rr_cdnNotPt : ALU32_rr<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst = combine($src2, $src3)", + []>; + +// Conditional logical operations. 
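The conditional logical operations below follow the same suffix convention used throughout this file and in HexagonInstrInfo.cpp above: _cPt executes when the predicate register is true, _cNotPt when it is false, and the _cdn forms test the .new value of a predicate generated in the same packet. As a reading aid, the predicated-execution model in plain C++:

#include <cstdint>

// "if ($p) $dst = add($a, $b)" - the *_cPt form.  When the predicate is
// false the destination keeps its previous contents.
static void add_cPt(bool p, int32_t &dst, int32_t a, int32_t b) {
  if (p)
    dst = a + b;
}

// The *_cNotPt form tests the inverted predicate.
static void add_cNotPt(bool p, int32_t &dst, int32_t a, int32_t b) {
  if (!p)
    dst = a + b;
}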
+ +def XOR_rr_cPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst = xor($src2, $src3)", + []>; + +def XOR_rr_cNotPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst = xor($src2, $src3)", + []>; + +def XOR_rr_cdnPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst = xor($src2, $src3)", + []>; + +def XOR_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst = xor($src2, $src3)", + []>; + +def AND_rr_cPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst = and($src2, $src3)", + []>; + +def AND_rr_cNotPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst = and($src2, $src3)", + []>; + +def AND_rr_cdnPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst = and($src2, $src3)", + []>; + +def AND_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst = and($src2, $src3)", + []>; + +def OR_rr_cPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst = or($src2, $src3)", + []>; + +def OR_rr_cNotPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst = or($src2, $src3)", + []>; + +def OR_rr_cdnPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst = or($src2, $src3)", + []>; + +def OR_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst = or($src2, $src3)", + []>; + + +// Conditional subtract. + +def SUB_rr_cPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst = sub($src2, $src3)", + []>; + +def SUB_rr_cNotPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst = sub($src2, $src3)", + []>; + +def SUB_rr_cdnPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst = sub($src2, $src3)", + []>; + +def SUB_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst = sub($src2, $src3)", + []>; + + +// Conditional transfer. 
+ +let neverHasSideEffects = 1 in +def TFR_cPt : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1) $dst = $src2", + []>; + +let neverHasSideEffects = 1 in +def TFR_cNotPt : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, + IntRegs:$src2), + "if (!$src1) $dst = $src2", + []>; + +let neverHasSideEffects = 1 in +def TFRI_cPt : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, s12Imm:$src2), + "if ($src1) $dst = #$src2", + []>; + +let neverHasSideEffects = 1 in +def TFRI_cNotPt : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, + s12Imm:$src2), + "if (!$src1) $dst = #$src2", + []>; + +let neverHasSideEffects = 1 in +def TFR_cdnPt : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, + IntRegs:$src2), + "if ($src1.new) $dst = $src2", + []>; + +let neverHasSideEffects = 1 in +def TFR_cdnNotPt : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, + IntRegs:$src2), + "if (!$src1.new) $dst = $src2", + []>; + +let neverHasSideEffects = 1 in +def TFRI_cdnPt : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, + s12Imm:$src2), + "if ($src1.new) $dst = #$src2", + []>; + +let neverHasSideEffects = 1 in +def TFRI_cdnNotPt : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, + s12Imm:$src2), + "if (!$src1.new) $dst = #$src2", + []>; + +// Compare. +defm CMPGTU : CMP32_rr_ri_u9<"cmp.gtu", setugt>; +defm CMPGT : CMP32_rr_ri_s10<"cmp.gt", setgt>; +defm CMPLT : CMP32_rr<"cmp.lt", setlt>; +defm CMPEQ : CMP32_rr_ri_s10<"cmp.eq", seteq>; +defm CMPGE : CMP32_ri_s8<"cmp.ge", setge>; +defm CMPGEU : CMP32_ri_u9<"cmp.geu", setuge>; +//===----------------------------------------------------------------------===// +// ALU32/PRED - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ALU32/VH + +//===----------------------------------------------------------------------===// +// Vector add halfwords + +// Vector averagehalfwords + +// Vector subtract halfwords +//===----------------------------------------------------------------------===// +// ALU32/VH - +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// ALU64/ALU + +//===----------------------------------------------------------------------===// +// Add. +def ADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = add($src1, $src2)", + [(set DoubleRegs:$dst, (add DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +// Add halfword. + +// Compare. +defm CMPEHexagon4 : CMP64_rr<"cmp.eq", seteq>; +defm CMPGT64 : CMP64_rr<"cmp.gt", setgt>; +defm CMPGTU64 : CMP64_rr<"cmp.gtu", setugt>; + +// Logical operations. +def AND_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = and($src1, $src2)", + [(set DoubleRegs:$dst, (and DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +def OR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = or($src1, $src2)", + [(set DoubleRegs:$dst, (or DoubleRegs:$src1, DoubleRegs:$src2))]>; + +def XOR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = xor($src1, $src2)", + [(set DoubleRegs:$dst, (xor DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +// Maximum. 
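Before the max/min definitions below, note how the defm lines above expand: each defm instantiates one of the CMP32_*/CMP64_* multiclasses defined earlier, and the resulting records are named by appending the inner def name, so defm CMPGT yields CMPGTrr and CMPGTri. A hypothetical C++ use of two generated opcodes (illustration only, not code from this patch):

#include "HexagonInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// PredReg = cmp.gt(Ra, Rb);  Rd = mux(PredReg, Rs, Rt)
static void emitCmpAndMux(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator I, DebugLoc DL,
                          const HexagonInstrInfo &TII, unsigned PredReg,
                          unsigned Ra, unsigned Rb, unsigned Rd,
                          unsigned Rs, unsigned Rt) {
  BuildMI(MBB, I, DL, TII.get(Hexagon::CMPGTrr), PredReg)
      .addReg(Ra).addReg(Rb);
  BuildMI(MBB, I, DL, TII.get(Hexagon::MUX_rr), Rd)
      .addReg(PredReg).addReg(Rs).addReg(Rt);
}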
+def MAXw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = max($src2, $src1)", + [(set IntRegs:$dst, (select (i1 (setlt IntRegs:$src2, + IntRegs:$src1)), + IntRegs:$src1, IntRegs:$src2))]>; + +// Minimum. +def MINw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = min($src2, $src1)", + [(set IntRegs:$dst, (select (i1 (setgt IntRegs:$src2, + IntRegs:$src1)), + IntRegs:$src1, IntRegs:$src2))]>; + +// Subtract. +def SUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = sub($src1, $src2)", + [(set DoubleRegs:$dst, (sub DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +// Subtract halfword. + +// Transfer register. +let neverHasSideEffects = 1 in +def TFR_64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), + "$dst = $src1", + []>; +//===----------------------------------------------------------------------===// +// ALU64/ALU - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ALU64/BIT + +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// ALU64/BIT - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ALU64/PERM + +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// ALU64/PERM - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ALU64/VB + +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// ALU64/VB - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ALU64/VH + +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// ALU64/VH - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ALU64/VW + +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// ALU64/VW - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// CR + +//===----------------------------------------------------------------------===// +// Logical reductions on predicates. + +// Looping instructions. + +// Pipelined looping instructions. + +// Logical operations on predicates. 
+def AND_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2), + "$dst = and($src1, $src2)", + [(set PredRegs:$dst, (and PredRegs:$src1, PredRegs:$src2))]>; + +let neverHasSideEffects = 1 in +def AND_pnotp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, + PredRegs:$src2), + "$dst = and($src1, !$src2)", + []>; + +def NOT_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1), + "$dst = not($src1)", + [(set PredRegs:$dst, (not PredRegs:$src1))]>; + +def ANY_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1), + "$dst = any8($src1)", + []>; + +def ALL_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1), + "$dst = all8($src1)", + []>; + +def VITPACK_pp : SInst<(outs IntRegs:$dst), (ins PredRegs:$src1, + PredRegs:$src2), + "$dst = vitpack($src1, $src2)", + []>; + +def VALIGN_rrp : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2, + PredRegs:$src3), + "$dst = valignb($src1, $src2, $src3)", + []>; + +def VSPLICE_rrp : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2, + PredRegs:$src3), + "$dst = vspliceb($src1, $src2, $src3)", + []>; + +def MASK_p : SInst<(outs DoubleRegs:$dst), (ins PredRegs:$src1), + "$dst = mask($src1)", + []>; + +def NOT_Ps : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1), + "$dst = not($src1)", + [(set PredRegs:$dst, (not PredRegs:$src1))]>; + +def OR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2), + "$dst = or($src1, $src2)", + [(set PredRegs:$dst, (or PredRegs:$src1, PredRegs:$src2))]>; + +def XOR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2), + "$dst = xor($src1, $src2)", + [(set PredRegs:$dst, (xor PredRegs:$src1, PredRegs:$src2))]>; + + +// User control register transfer. +//===----------------------------------------------------------------------===// +// CR - +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// J + +//===----------------------------------------------------------------------===// +// Jump to address. +let isBranch = 1, isTerminator=1, isBarrier = 1, isPredicable = 1 in { + def JMP : JInst< (outs), + (ins brtarget:$offset), + "jump $offset", + [(br bb:$offset)]>; +} + +// if (p0) jump +let isBranch = 1, isTerminator=1, Defs = [PC] in { + def JMP_Pred : JInst< (outs), + (ins PredRegs:$src, brtarget:$offset), + "if ($src) jump $offset", + [(brcond PredRegs:$src, bb:$offset)]>; +} + +// if (!p0) jump +let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC] in { + def JMP_PredNot : JInst< (outs), + (ins PredRegs:$src, brtarget:$offset), + "if (!$src) jump $offset", + []>; +} + +let isTerminator = 1, isBranch = 1, neverHasSideEffects = 1, Defs = [PC] in { + def BRCOND : JInst < (outs), (ins PredRegs:$pred, brtarget:$dst), + "if ($pred) jump $dst", + []>; +} + +// Jump to address conditioned on new predicate. +// if (p0) jump:t +let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC] in { + def JMP_PredPt : JInst< (outs), + (ins PredRegs:$src, brtarget:$offset), + "if ($src.new) jump:t $offset", + []>; +} + +// if (!p0) jump:t +let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC] in { + def JMP_PredNotPt : JInst< (outs), + (ins PredRegs:$src, brtarget:$offset), + "if (!$src.new) jump:t $offset", + []>; +} + +// Not taken. 
+let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC] in { + def JMP_PredPnt : JInst< (outs), + (ins PredRegs:$src, brtarget:$offset), + "if ($src.new) jump:nt $offset", + []>; +} + +// Not taken. +let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC] in { + def JMP_PredNotPnt : JInst< (outs), + (ins PredRegs:$src, brtarget:$offset), + "if (!$src.new) jump:nt $offset", + []>; +} +//===----------------------------------------------------------------------===// +// J - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// JR + +//===----------------------------------------------------------------------===// +def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; + +// Jump to address from register. +let isReturn = 1, isTerminator = 1, isBarrier = 1, + Defs = [PC], Uses = [R31] in { + def JMPR: JRInst<(outs), (ins), + "jumpr r31", + [(retflag)]>; +} + +// Jump to address from register. +let isReturn = 1, isTerminator = 1, isBarrier = 1, + Defs = [PC], Uses = [R31] in { + def JMPR_cPt: JRInst<(outs), (ins PredRegs:$src1), + "if ($src1) jumpr r31", + []>; +} + +// Jump to address from register. +let isReturn = 1, isTerminator = 1, isBarrier = 1, + Defs = [PC], Uses = [R31] in { + def JMPR_cNotPt: JRInst<(outs), (ins PredRegs:$src1), + "if (!$src1) jumpr r31", + []>; +} + +//===----------------------------------------------------------------------===// +// JR - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// LD + +//===----------------------------------------------------------------------===// +/// +/// Make sure that in post increment load, the first operand is always the post +/// increment operand. +/// +// Load doubleword. +let isPredicable = 1 in +def LDrid : LDInst<(outs DoubleRegs:$dst), + (ins MEMri:$addr), + "$dst = memd($addr)", + [(set DoubleRegs:$dst, (load ADDRriS11_3:$addr))]>; + +let isPredicable = 1, AddedComplexity = 20 in +def LDrid_indexed : LDInst<(outs DoubleRegs:$dst), + (ins IntRegs:$src1, s11_3Imm:$offset), + "$dst=memd($src1+#$offset)", + [(set DoubleRegs:$dst, (load (add IntRegs:$src1, + s11_3ImmPred:$offset)))]>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrid_GP : LDInst<(outs DoubleRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memd(#$global+$offset)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDd_GP : LDInst<(outs DoubleRegs:$dst), + (ins globaladdress:$global), + "$dst=memd(#$global)", + []>; + +let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrid : LDInstPI<(outs DoubleRegs:$dst, IntRegs:$dst2), + (ins IntRegs:$src1, s4Imm:$offset), + "$dst = memd($src1++#$offset)", + [], + "$src1 = $dst2">; + +// Load doubleword conditionally. 
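Before the conditional doubleword loads that follow, the post-increment note above can be restated on the host side: a post-increment load produces both the loaded value and the written-back base, which is why POST_LDrid lists two outputs and ties $src1 to $dst2. Reading aid only; legal increments are the ones isValidAutoIncImm accepts:

#include <cstdint>
#include <cstring>

// Model of POST_LDrid, "$dst = memd($src1++#$offset)": read the doubleword at
// the current base, then advance the base register by the immediate.
static int64_t post_ldrid(const uint8_t *&base, int offset) {
  int64_t value;
  std::memcpy(&value, base, sizeof(value));  // value = memd(base)
  base += offset;                            // base is written back
  return value;
}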
+let mayLoad = 1, neverHasSideEffects = 1 in +def LDrid_cPt : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1) $dst = memd($addr)", + []>; + + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrid_cNotPt : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1) $dst = memd($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrid_indexed_cPt : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), + "if ($src1) $dst=memd($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrid_indexed_cNotPt : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), + "if (!$src1) $dst=memd($src2+#$src3)", + []>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrid_cPt : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), + "if ($src1) $dst1 = memd($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrid_cNotPt : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), + "if (!$src1) $dst1 = memd($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrid_cdnPt : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1.new) $dst = memd($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrid_cdnNotPt : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1.new) $dst = memd($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrid_indexed_cdnPt : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), + "if ($src1.new) $dst=memd($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrid_indexed_cdnNotPt : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), + "if (!$src1.new) $dst=memd($src2+#$src3)", + []>; + + +// Load byte. +let isPredicable = 1 in +def LDrib : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), + "$dst = memb($addr)", + [(set IntRegs:$dst, (sextloadi8 ADDRriS11_0:$addr))]>; + +def LDrib_ae : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), + "$dst = memb($addr)", + [(set IntRegs:$dst, (extloadi8 ADDRriS11_0:$addr))]>; + +// Indexed load byte. +let isPredicable = 1, AddedComplexity = 20 in +def LDrib_indexed : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_0Imm:$offset), + "$dst=memb($src1+#$offset)", + [(set IntRegs:$dst, (sextloadi8 (add IntRegs:$src1, + s11_0ImmPred:$offset)))]>; + + +// Indexed load byte any-extend. 
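Before the any-extending indexed form that follows, the distinction between the two byte-load flavours above is worth spelling out: the sextloadi8 pattern (LDrib) sign-extends the loaded byte into the 32-bit destination, while extloadi8 (the _ae variants) only guarantees the low 8 bits to the consumer. A host-side sketch of both, as a reading aid:

#include <cstdint>

// memb with sign extension (the sextloadi8 pattern of LDrib).
static int32_t load_byte_sext(const void *p) {
  return (int32_t)*(const int8_t *)p;
}

// memb with "any" extension (extloadi8): only the low 8 bits are meaningful;
// zero-extending is one valid choice for the upper bits.
static int32_t load_byte_anyext(const void *p) {
  return (int32_t)*(const uint8_t *)p;
}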
+let AddedComplexity = 20 in +def LDrib_ae_indexed : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_0Imm:$offset), + "$dst=memb($src1+#$offset)", + [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, + s11_0ImmPred:$offset)))]>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrib_GP : LDInst<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memb(#$global+$offset)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDb_GP : LDInst<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memb(#$global)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDub_GP : LDInst<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memub(#$global)", + []>; + +let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrib : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), + (ins IntRegs:$src1, s4Imm:$offset), + "$dst = memb($src1++#$offset)", + [], + "$src1 = $dst2">; + +// Load byte conditionally. +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrib_cPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1) $dst = memb($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrib_cNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1) $dst = memb($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrib_indexed_cPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), + "if ($src1) $dst = memb($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrib_indexed_cNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), + "if (!$src1) $dst = memb($src2+#$src3)", + []>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrib_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), + "if ($src1) $dst1 = memb($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrib_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), + "if (!$src1) $dst1 = memb($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrib_cdnPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1.new) $dst = memb($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrib_cdnNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1.new) $dst = memb($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrib_indexed_cdnPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), + "if ($src1.new) $dst = memb($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrib_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), + "if (!$src1.new) $dst = memb($src2+#$src3)", + []>; + + +// Load halfword. 
+let isPredicable = 1 in +def LDrih : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), + "$dst = memh($addr)", + [(set IntRegs:$dst, (sextloadi16 ADDRriS11_1:$addr))]>; + +let isPredicable = 1, AddedComplexity = 20 in +def LDrih_indexed : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_1Imm:$offset), + "$dst=memh($src1+#$offset)", + [(set IntRegs:$dst, (sextloadi16 (add IntRegs:$src1, + s11_1ImmPred:$offset)))] >; + +def LDrih_ae : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), + "$dst = memh($addr)", + [(set IntRegs:$dst, (extloadi16 ADDRriS11_1:$addr))]>; + +let AddedComplexity = 20 in +def LDrih_ae_indexed : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_1Imm:$offset), + "$dst=memh($src1+#$offset)", + [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1, + s11_1ImmPred:$offset)))] >; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrih_GP : LDInst<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memh(#$global+$offset)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDh_GP : LDInst<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memh(#$global)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDuh_GP : LDInst<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memuh(#$global)", + []>; + + +let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrih : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), + (ins IntRegs:$src1, s4Imm:$offset), + "$dst = memh($src1++#$offset)", + [], + "$src1 = $dst2">; + +// Load halfword conditionally. +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrih_cPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1) $dst = memh($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrih_cNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1) $dst = memh($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrih_indexed_cPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), + "if ($src1) $dst = memh($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrih_indexed_cNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), + "if (!$src1) $dst = memh($src2+#$src3)", + []>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrih_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), + "if ($src1) $dst1 = memh($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrih_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), + "if (!$src1) $dst1 = memh($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrih_cdnPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1.new) $dst = memh($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrih_cdnNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1.new) $dst = memh($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrih_indexed_cdnPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), + "if ($src1.new) $dst = memh($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDrih_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, 
u6_1Imm:$src3), + "if (!$src1.new) $dst = memh($src2+#$src3)", + []>; + +// Load unsigned byte. +let isPredicable = 1 in +def LDriub : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), + "$dst = memub($addr)", + [(set IntRegs:$dst, (zextloadi8 ADDRriS11_0:$addr))]>; + +let isPredicable = 1 in +def LDriubit : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), + "$dst = memub($addr)", + [(set IntRegs:$dst, (zextloadi1 ADDRriS11_0:$addr))]>; + +let isPredicable = 1, AddedComplexity = 20 in +def LDriub_indexed : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_0Imm:$offset), + "$dst=memub($src1+#$offset)", + [(set IntRegs:$dst, (zextloadi8 (add IntRegs:$src1, + s11_0ImmPred:$offset)))]>; + +let AddedComplexity = 20 in +def LDriubit_indexed : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_0Imm:$offset), + "$dst=memub($src1+#$offset)", + [(set IntRegs:$dst, (zextloadi1 (add IntRegs:$src1, + s11_0ImmPred:$offset)))]>; + +def LDriub_ae : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), + "$dst = memub($addr)", + [(set IntRegs:$dst, (extloadi8 ADDRriS11_0:$addr))]>; + + +let AddedComplexity = 20 in +def LDriub_ae_indexed : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_0Imm:$offset), + "$dst=memub($src1+#$offset)", + [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, + s11_0ImmPred:$offset)))]>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriub_GP : LDInst<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memub(#$global+$offset)", + []>; + +let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriub : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), + (ins IntRegs:$src1, s4Imm:$offset), + "$dst = memub($src1++#$offset)", + [], + "$src1 = $dst2">; + +// Load unsigned byte conditionally. 
+let mayLoad = 1, neverHasSideEffects = 1 in +def LDriub_cPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1) $dst = memub($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriub_cNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1) $dst = memub($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriub_indexed_cPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), + "if ($src1) $dst = memub($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriub_indexed_cNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), + "if (!$src1) $dst = memub($src2+#$src3)", + []>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriub_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), + "if ($src1) $dst1 = memub($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriub_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), + "if (!$src1) $dst1 = memub($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriub_cdnPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1.new) $dst = memub($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriub_cdnNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1.new) $dst = memub($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriub_indexed_cdnPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), + "if ($src1.new) $dst = memub($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriub_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), + "if (!$src1.new) $dst = memub($src2+#$src3)", + []>; + +// Load unsigned halfword. +let isPredicable = 1 in +def LDriuh : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), + "$dst = memuh($addr)", + [(set IntRegs:$dst, (zextloadi16 ADDRriS11_1:$addr))]>; + +// Indexed load unsigned halfword. +let isPredicable = 1, AddedComplexity = 20 in +def LDriuh_indexed : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_1Imm:$offset), + "$dst=memuh($src1+#$offset)", + [(set IntRegs:$dst, (zextloadi16 (add IntRegs:$src1, + s11_1ImmPred:$offset)))]>; + +def LDriuh_ae : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), + "$dst = memuh($addr)", + [(set IntRegs:$dst, (extloadi16 ADDRriS11_1:$addr))]>; + + +// Indexed load unsigned halfword any-extend. +let AddedComplexity = 20 in +def LDriuh_ae_indexed : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_1Imm:$offset), + "$dst=memuh($src1+#$offset)", + [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1, + s11_1ImmPred:$offset)))] >; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriuh_GP : LDInst<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memuh(#$global+$offset)", + []>; + +let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriuh : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), + (ins IntRegs:$src1, s4Imm:$offset), + "$dst = memuh($src1++#$offset)", + [], + "$src1 = $dst2">; + +// Load unsigned halfword conditionally. 
+let mayLoad = 1, neverHasSideEffects = 1 in +def LDriuh_cPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1) $dst = memuh($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriuh_cNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1) $dst = memuh($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriuh_indexed_cPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), + "if ($src1) $dst = memuh($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriuh_indexed_cNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), + "if (!$src1) $dst = memuh($src2+#$src3)", + []>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriuh_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), + "if ($src1) $dst1 = memuh($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriuh_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), + "if (!$src1) $dst1 = memuh($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriuh_cdnPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1.new) $dst = memuh($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriuh_cdnNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1.new) $dst = memuh($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriuh_indexed_cdnPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), + "if ($src1.new) $dst = memuh($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriuh_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), + "if (!$src1.new) $dst = memuh($src2+#$src3)", + []>; + + +// Load word. +let isPredicable = 1 in +def LDriw : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), "$dst = memw($addr)", + [(set IntRegs:$dst, (load ADDRriS11_2:$addr))]>; + +// Load predicate. +let mayLoad = 1, Defs = [R10,R11] in +def LDriw_pred : LDInst<(outs PredRegs:$dst), + (ins MEMri:$addr), + "Error; should not emit", + []>; + +// Indexed load. +let isPredicable = 1, AddedComplexity = 20 in +def LDriw_indexed : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_2Imm:$offset), + "$dst=memw($src1+#$offset)", + [(set IntRegs:$dst, (load (add IntRegs:$src1, + s11_2ImmPred:$offset)))]>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriw_GP : LDInst<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memw(#$global+$offset)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDw_GP : LDInst<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memw(#$global)", + []>; + +let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriw : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), + (ins IntRegs:$src1, s4Imm:$offset), + "$dst = memw($src1++#$offset)", + [], + "$src1 = $dst2">; + +// Load word conditionally. 
+ +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriw_cPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1) $dst = memw($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriw_cNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1) $dst = memw($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriw_indexed_cPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), + "if ($src1) $dst=memw($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriw_indexed_cNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), + "if (!$src1) $dst=memw($src2+#$src3)", + []>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriw_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), + "if ($src1) $dst1 = memw($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriw_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), + "if (!$src1) $dst1 = memw($src2++#$src3)", + [], + "$src2 = $dst2">; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriw_cdnPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if ($src1.new) $dst = memw($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriw_cdnNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, MEMri:$addr), + "if (!$src1.new) $dst = memw($addr)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriw_indexed_cdnPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), + "if ($src1.new) $dst=memw($src2+#$src3)", + []>; + +let mayLoad = 1, neverHasSideEffects = 1 in +def LDriw_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), + "if (!$src1.new) $dst=memw($src2+#$src3)", + []>; + +// Deallocate stack frame. +let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in { + def DEALLOCFRAME : LDInst<(outs), (ins i32imm:$amt1), + "deallocframe", + []>; +} + +// Load and unpack bytes to halfwords. +//===----------------------------------------------------------------------===// +// LD - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MTYPE/ALU + +//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// MTYPE/ALU - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MTYPE/COMPLEX + +//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// MTYPE/COMPLEX - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MTYPE/MPYH + +//===----------------------------------------------------------------------===// +// Multiply and use lower result. 
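+// For example, a plain 32-bit multiply such as the C expression "a * b" is
+// matched by the MPYI pattern below; the MPYI_riu/MPYI_rin/MPYI_ri forms cover
+// the cases where one operand is a small immediate.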
+// Rd=+mpyi(Rs,#u8) +def MPYI_riu : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2), + "$dst =+ mpyi($src1, #$src2)", + [(set IntRegs:$dst, (mul IntRegs:$src1, u8ImmPred:$src2))]>; + +// Rd=-mpyi(Rs,#u8) +def MPYI_rin : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, n8Imm:$src2), + "$dst =- mpyi($src1, #$src2)", + [(set IntRegs:$dst, + (mul IntRegs:$src1, n8ImmPred:$src2))]>; + +// Rd=mpyi(Rs,#m9) +// s9 is NOT the same as m9 - but it works.. so far. +// Assembler maps to either Rd=+mpyi(Rs,#u8 or Rd=-mpyi(Rs,#u8) +// depending on the value of m9. See Arch Spec. +def MPYI_ri : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Imm:$src2), + "$dst = mpyi($src1, #$src2)", + [(set IntRegs:$dst, (mul IntRegs:$src1, s9ImmPred:$src2))]>; + +// Rd=mpyi(Rs,Rt) +def MPYI : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = mpyi($src1, $src2)", + [(set IntRegs:$dst, (mul IntRegs:$src1, IntRegs:$src2))]>; + +// Rx+=mpyi(Rs,#u8) +def MPYI_acc_ri : MInst_acc<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u8Imm:$src3), + "$dst += mpyi($src2, #$src3)", + [(set IntRegs:$dst, + (add (mul IntRegs:$src2, u8ImmPred:$src3), IntRegs:$src1))], + "$src1 = $dst">; + +// Rx+=mpyi(Rs,Rt) +def MPYI_acc_rr : MInst_acc<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "$dst += mpyi($src2, $src3)", + [(set IntRegs:$dst, + (add (mul IntRegs:$src2, IntRegs:$src3), IntRegs:$src1))], + "$src1 = $dst">; + +// Rx-=mpyi(Rs,#u8) +def MPYI_sub_ri : MInst_acc<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u8Imm:$src3), + "$dst -= mpyi($src2, #$src3)", + [(set IntRegs:$dst, + (sub IntRegs:$src1, (mul IntRegs:$src2, u8ImmPred:$src3)))], + "$src1 = $dst">; + +// Multiply and use upper result. +// Rd=mpy(Rs,Rt.H):<<1:rnd:sat +// Rd=mpy(Rs,Rt.L):<<1:rnd:sat +// Rd=mpy(Rs,Rt) +def MPY : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = mpy($src1, $src2)", + [(set IntRegs:$dst, (mulhs IntRegs:$src1, IntRegs:$src2))]>; + +// Rd=mpy(Rs,Rt):rnd +// Rd=mpyu(Rs,Rt) +def MPYU : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = mpyu($src1, $src2)", + [(set IntRegs:$dst, (mulhu IntRegs:$src1, IntRegs:$src2))]>; + +// Multiply and use full result. +// Rdd=mpyu(Rs,Rt) +def MPYU64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = mpyu($src1, $src2)", + [(set DoubleRegs:$dst, (mul (i64 (anyext IntRegs:$src1)), + (i64 (anyext IntRegs:$src2))))]>; + +// Rdd=mpy(Rs,Rt) +def MPY64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = mpy($src1, $src2)", + [(set DoubleRegs:$dst, (mul (i64 (sext IntRegs:$src1)), + (i64 (sext IntRegs:$src2))))]>; + + +// Multiply and accumulate, use full result. 
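+// For example, a 64-bit accumulation of a widening 32x32 multiply, as in the
+// C statement "acc += (long long)a * (long long)b", corresponds to MPY64_acc
+// below; the mpyu forms are the zero-extending (unsigned) counterparts.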
+// Rxx[+-]=mpy(Rs,Rt)
+// Rxx+=mpy(Rs,Rt)
+def MPY64_acc : MInst_acc<(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ "$dst += mpy($src2, $src3)",
+ [(set DoubleRegs:$dst,
+ (add (mul (i64 (sext IntRegs:$src2)), (i64 (sext IntRegs:$src3))),
+ DoubleRegs:$src1))],
+ "$src1 = $dst">;
+
+// Rxx-=mpy(Rs,Rt)
+def MPY64_sub : MInst_acc<(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ "$dst -= mpy($src2, $src3)",
+ [(set DoubleRegs:$dst,
+ (sub DoubleRegs:$src1,
+ (mul (i64 (sext IntRegs:$src2)), (i64 (sext IntRegs:$src3)))))],
+ "$src1 = $dst">;
+
+// Rxx[+-]=mpyu(Rs,Rt)
+// Rxx+=mpyu(Rs,Rt)
+def MPYU64_acc : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+ IntRegs:$src2, IntRegs:$src3),
+ "$dst += mpyu($src2, $src3)",
+ [(set DoubleRegs:$dst, (add (mul (i64 (anyext IntRegs:$src2)),
+ (i64 (anyext IntRegs:$src3))),
+ DoubleRegs:$src1))],"$src1 = $dst">;
+
+// Rxx-=mpyu(Rs,Rt)
+def MPYU64_sub : MInst_acc<(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ "$dst -= mpyu($src2, $src3)",
+ [(set DoubleRegs:$dst,
+ (sub DoubleRegs:$src1,
+ (mul (i64 (anyext IntRegs:$src2)),
+ (i64 (anyext IntRegs:$src3)))))],
+ "$src1 = $dst">;
+
+
+def ADDrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+ IntRegs:$src2, IntRegs:$src3),
+ "$dst += add($src2, $src3)",
+ [(set IntRegs:$dst, (add (add IntRegs:$src2, IntRegs:$src3),
+ IntRegs:$src1))],
+ "$src1 = $dst">;
+
+def ADDri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+ IntRegs:$src2, s8Imm:$src3),
+ "$dst += add($src2, #$src3)",
+ [(set IntRegs:$dst, (add (add IntRegs:$src2, s8ImmPred:$src3),
+ IntRegs:$src1))],
+ "$src1 = $dst">;
+
+def SUBrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+ IntRegs:$src2, IntRegs:$src3),
+ "$dst -= add($src2, $src3)",
+ [(set IntRegs:$dst, (sub IntRegs:$src1, (add IntRegs:$src2,
+ IntRegs:$src3)))],
+ "$src1 = $dst">;
+
+def SUBri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+ IntRegs:$src2, s8Imm:$src3),
+ "$dst -= add($src2, #$src3)",
+ [(set IntRegs:$dst, (sub IntRegs:$src1,
+ (add IntRegs:$src2, s8ImmPred:$src3)))],
+ "$src1 = $dst">;
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYH -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYS +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYS -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/VB +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/VB -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/VH +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/VH -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ST +
+//===----------------------------------------------------------------------===// +/// +/// Assumptions::: ****** DO NOT IGNORE ******** +/// 1. Make sure that in post increment store, the zero'th operand is always the +/// post increment operand. +/// 2. Make sure that the store value operand(Rt/Rtt) in a store is always the +/// last operand. +/// +// Store doubleword. +let isPredicable = 1 in +def STrid : STInst<(outs), + (ins MEMri:$addr, DoubleRegs:$src1), + "memd($addr) = $src1", + [(store DoubleRegs:$src1, ADDRriS11_3:$addr)]>; + +// Indexed store double word. +let AddedComplexity = 10, isPredicable = 1 in +def STrid_indexed : STInst<(outs), + (ins IntRegs:$src1, s11_3Imm:$src2, DoubleRegs:$src3), + "memd($src1+#$src2) = $src3", + [(store DoubleRegs:$src3, + (add IntRegs:$src1, s11_3ImmPred:$src2))]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STrid_GP : STInst<(outs), + (ins globaladdress:$global, u16Imm:$offset, DoubleRegs:$src), + "memd(#$global+$offset) = $src", + []>; + +let hasCtrlDep = 1, isPredicable = 1 in +def POST_STdri : STInstPI<(outs IntRegs:$dst), + (ins DoubleRegs:$src1, IntRegs:$src2, s4Imm:$offset), + "memd($src2++#$offset) = $src1", + [(set IntRegs:$dst, + (post_store DoubleRegs:$src1, IntRegs:$src2, s4_3ImmPred:$offset))], + "$src2 = $dst">; + +// Store doubleword conditionally. +// if ([!]Pv) memd(Rs+#u6:3)=Rtt +// if (Pv) memd(Rs+#u6:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_cPt : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), + "if ($src1) memd($addr) = $src2", + []>; + +// if (!Pv) memd(Rs+#u6:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_cNotPt : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), + "if (!$src1) memd($addr) = $src2", + []>; + +// if (Pv) memd(Rs+#u6:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_indexed_cPt : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, + DoubleRegs:$src4), + "if ($src1) memd($src2+#$src3) = $src4", + []>; + +// if (!Pv) memd(Rs+#u6:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_indexed_cNotPt : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, + DoubleRegs:$src4), + "if (!$src1) memd($src2+#$src3) = $src4", + []>; + +// if ([!]Pv) memd(Rx++#s4:3)=Rtt +// if (Pv) memd(Rx++#s4:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def POST_STdri_cPt : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, + s4_3Imm:$offset), + "if ($src1) memd($src3++#$offset) = $src2", + [], + "$src3 = $dst">; + +// if (!Pv) memd(Rx++#s4:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def POST_STdri_cNotPt : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, + s4_3Imm:$offset), + "if (!$src1) memd($src3++#$offset) = $src2", + [], + "$src3 = $dst">; + + +// Store byte. 
+// memb(Rs+#s11:0)=Rt +let isPredicable = 1 in +def STrib : STInst<(outs), + (ins MEMri:$addr, IntRegs:$src1), + "memb($addr) = $src1", + [(truncstorei8 IntRegs:$src1, ADDRriS11_0:$addr)]>; + +let AddedComplexity = 10, isPredicable = 1 in +def STrib_indexed : STInst<(outs), + (ins IntRegs:$src1, s11_0Imm:$src2, IntRegs:$src3), + "memb($src1+#$src2) = $src3", + [(truncstorei8 IntRegs:$src3, (add IntRegs:$src1, + s11_0ImmPred:$src2))]>; + +// memb(gp+#u16:0)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_GP : STInst<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memb(#$global+$offset) = $src", + []>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STb_GP : STInst<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memb(#$global) = $src", + []>; + +// memb(Rx++#s4:0)=Rt +let hasCtrlDep = 1, isPredicable = 1 in +def POST_STbri : STInstPI<(outs IntRegs:$dst), (ins IntRegs:$src1, + IntRegs:$src2, + s4Imm:$offset), + "memb($src2++#$offset) = $src1", + [(set IntRegs:$dst, + (post_truncsti8 IntRegs:$src1, IntRegs:$src2, + s4_0ImmPred:$offset))], + "$src2 = $dst">; + +// Store byte conditionally. +// if ([!]Pv) memb(Rs+#u6:0)=Rt +// if (Pv) memb(Rs+#u6:0)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_cPt : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1) memb($addr) = $src2", + []>; + +// if (!Pv) memb(Rs+#u6:0)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_cNotPt : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1) memb($addr) = $src2", + []>; + +// if (Pv) memb(Rs+#u6:0)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_indexed_cPt : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), + "if ($src1) memb($src2+#$src3) = $src4", + []>; + +// if (!Pv) memb(Rs+#u6:0)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_indexed_cNotPt : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), + "if (!$src1) memb($src2+#$src3) = $src4", + []>; + +// if ([!]Pv) memb(Rx++#s4:0)=Rt +// if (Pv) memb(Rx++#s4:0)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_STbri_cPt : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), + "if ($src1) memb($src3++#$offset) = $src2", + [],"$src3 = $dst">; + +// if (!Pv) memb(Rx++#s4:0)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_STbri_cNotPt : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), + "if (!$src1) memb($src3++#$offset) = $src2", + [],"$src3 = $dst">; + + +// Store halfword. 
+// memh(Rs+#s11:1)=Rt +let isPredicable = 1 in +def STrih : STInst<(outs), + (ins MEMri:$addr, IntRegs:$src1), + "memh($addr) = $src1", + [(truncstorei16 IntRegs:$src1, ADDRriS11_1:$addr)]>; + + +let AddedComplexity = 10, isPredicable = 1 in +def STrih_indexed : STInst<(outs), + (ins IntRegs:$src1, s11_1Imm:$src2, IntRegs:$src3), + "memh($src1+#$src2) = $src3", + [(truncstorei16 IntRegs:$src3, (add IntRegs:$src1, + s11_1ImmPred:$src2))]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_GP : STInst<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memh(#$global+$offset) = $src", + []>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STh_GP : STInst<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memh(#$global) = $src", + []>; + +// memh(Rx++#s4:1)=Rt.H +// memh(Rx++#s4:1)=Rt +let hasCtrlDep = 1, isPredicable = 1 in +def POST_SThri : STInstPI<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, s4Imm:$offset), + "memh($src2++#$offset) = $src1", + [(set IntRegs:$dst, + (post_truncsti16 IntRegs:$src1, IntRegs:$src2, + s4_1ImmPred:$offset))], + "$src2 = $dst">; + +// Store halfword conditionally. +// if ([!]Pv) memh(Rs+#u6:1)=Rt +// if (Pv) memh(Rs+#u6:1)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_cPt : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1) memh($addr) = $src2", + []>; + +// if (!Pv) memh(Rs+#u6:1)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_cNotPt : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1) memh($addr) = $src2", + []>; + +// if (Pv) memh(Rs+#u6:1)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_indexed_cPt : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), + "if ($src1) memh($src2+#$src3) = $src4", + []>; + +// if (!Pv) memh(Rs+#u6:1)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_indexed_cNotPt : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), + "if (!$src1) memh($src2+#$src3) = $src4", + []>; + +// if ([!]Pv) memh(Rx++#s4:1)=Rt +// if (Pv) memh(Rx++#s4:1)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_SThri_cPt : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), + "if ($src1) memh($src3++#$offset) = $src2", + [],"$src3 = $dst">; + +// if (!Pv) memh(Rx++#s4:1)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_SThri_cNotPt : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), + "if (!$src1) memh($src3++#$offset) = $src2", + [],"$src3 = $dst">; + + +// Store word. +// Store predicate. 
+let Defs = [R10,R11] in +def STriw_pred : STInst<(outs), + (ins MEMri:$addr, PredRegs:$src1), + "Error; should not emit", + []>; + +// memw(Rs+#s11:2)=Rt +let isPredicable = 1 in +def STriw : STInst<(outs), + (ins MEMri:$addr, IntRegs:$src1), + "memw($addr) = $src1", + [(store IntRegs:$src1, ADDRriS11_2:$addr)]>; + +let AddedComplexity = 10, isPredicable = 1 in +def STriw_indexed : STInst<(outs), + (ins IntRegs:$src1, s11_2Imm:$src2, IntRegs:$src3), + "memw($src1+#$src2) = $src3", + [(store IntRegs:$src3, (add IntRegs:$src1, s11_2ImmPred:$src2))]>; + +def STriwt : STInst<(outs), + (ins MEMri:$addr, DoubleRegs:$src1), + "memw($addr) = $src1", + [(truncstorei32 DoubleRegs:$src1, ADDRriS11_2:$addr)]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_GP : STInst<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memw(#$global+$offset) = $src", + []>; + +let hasCtrlDep = 1, isPredicable = 1 in +def POST_STwri : STInstPI<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, s4Imm:$offset), + "memw($src2++#$offset) = $src1", + [(set IntRegs:$dst, + (post_store IntRegs:$src1, IntRegs:$src2, s4_2ImmPred:$offset))], + "$src2 = $dst">; + +// Store word conditionally. +// if ([!]Pv) memw(Rs+#u6:2)=Rt +// if (Pv) memw(Rs+#u6:2)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_cPt : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1) memw($addr) = $src2", + []>; + +// if (!Pv) memw(Rs+#u6:2)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_cNotPt : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1) memw($addr) = $src2", + []>; + +// if (Pv) memw(Rs+#u6:2)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_indexed_cPt : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), + "if ($src1) memw($src2+#$src3) = $src4", + []>; + +// if (!Pv) memw(Rs+#u6:2)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_indexed_cNotPt : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), + "if (!$src1) memw($src2+#$src3) = $src4", + []>; + +// if ([!]Pv) memw(Rx++#s4:2)=Rt +// if (Pv) memw(Rx++#s4:2)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_STwri_cPt : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), + "if ($src1) memw($src3++#$offset) = $src2", + [],"$src3 = $dst">; + +// if (!Pv) memw(Rx++#s4:2)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_STwri_cNotPt : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), + "if (!$src1) memw($src3++#$offset) = $src2", + [],"$src3 = $dst">; + + + +// Allocate stack frame. +let Defs = [R29, R30], Uses = [R31, R30], neverHasSideEffects = 1 in { + def ALLOCFRAME : STInst<(outs), + (ins i32imm:$amt), + "allocframe(#$amt)", + []>; +} +//===----------------------------------------------------------------------===// +// ST - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// STYPE/ALU + +//===----------------------------------------------------------------------===// +// Logical NOT. +def NOT_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), + "$dst = not($src1)", + [(set DoubleRegs:$dst, (not DoubleRegs:$src1))]>; + + +// Sign extend word to doubleword. 
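+// That is, the i32 -> i64 sign extension produced by a C cast such as
+// "(long long)x"; the pattern below selects SXTW for any sext of a 32-bit
+// register.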
+def SXTW : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1), + "$dst = sxtw($src1)", + [(set DoubleRegs:$dst, (sext IntRegs:$src1))]>; +//===----------------------------------------------------------------------===// +// STYPE/ALU - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// STYPE/BIT + +//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// STYPE/BIT - +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// STYPE/COMPLEX + +//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// STYPE/COMPLEX - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// STYPE/PERM + +//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// STYPE/PERM - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// STYPE/PRED + +//===----------------------------------------------------------------------===// +// Predicate transfer. +let neverHasSideEffects = 1 in +def TFR_RsPd : SInst<(outs IntRegs:$dst), (ins PredRegs:$src1), + "$dst = $src1 // Should almost never emit this", + []>; + +def TFR_PdRs : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1), + "$dst = $src1 // Should almost never emit!", + [(set PredRegs:$dst, (trunc IntRegs:$src1))]>; +//===----------------------------------------------------------------------===// +// STYPE/PRED - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// STYPE/SHIFT + +//===----------------------------------------------------------------------===// +// Shift by immediate. +def ASR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = asr($src1, #$src2)", + [(set IntRegs:$dst, (sra IntRegs:$src1, u5ImmPred:$src2))]>; + +def ASRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2), + "$dst = asr($src1, #$src2)", + [(set DoubleRegs:$dst, (sra DoubleRegs:$src1, u6ImmPred:$src2))]>; + +def ASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = asl($src1, #$src2)", + [(set IntRegs:$dst, (shl IntRegs:$src1, u5ImmPred:$src2))]>; + +def LSR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = lsr($src1, #$src2)", + [(set IntRegs:$dst, (srl IntRegs:$src1, u5ImmPred:$src2))]>; + +def LSRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2), + "$dst = lsr($src1, #$src2)", + [(set DoubleRegs:$dst, (srl DoubleRegs:$src1, u6ImmPred:$src2))]>; + +def LSRd_ri_acc : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2, + u6Imm:$src3), + "$dst += lsr($src2, #$src3)", + [(set DoubleRegs:$dst, (add DoubleRegs:$src1, + (srl DoubleRegs:$src2, + u6ImmPred:$src3)))], + "$src1 = $dst">; + +// Shift by immediate and accumulate. 
+def ASR_rr_acc : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, + IntRegs:$src2, + IntRegs:$src3), + "$dst += asr($src2, $src3)", + [], "$src1 = $dst">; + +// Shift by immediate and add. +def ADDASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + u3Imm:$src3), + "$dst = addasl($src1, $src2, #$src3)", + [(set IntRegs:$dst, (add IntRegs:$src1, + (shl IntRegs:$src2, + u3ImmPred:$src3)))]>; + +// Shift by register. +def ASL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = asl($src1, $src2)", + [(set IntRegs:$dst, (shl IntRegs:$src1, IntRegs:$src2))]>; + +def ASR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = asr($src1, $src2)", + [(set IntRegs:$dst, (sra IntRegs:$src1, IntRegs:$src2))]>; + + +def LSR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = lsr($src1, $src2)", + [(set IntRegs:$dst, (srl IntRegs:$src1, IntRegs:$src2))]>; + +def LSLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), + "$dst = lsl($src1, $src2)", + [(set DoubleRegs:$dst, (shl DoubleRegs:$src1, IntRegs:$src2))]>; + +def ASRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + IntRegs:$src2), + "$dst = asr($src1, $src2)", + [(set DoubleRegs:$dst, (sra DoubleRegs:$src1, IntRegs:$src2))]>; + +def LSRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + IntRegs:$src2), + "$dst = lsr($src1, $src2)", + [(set DoubleRegs:$dst, (srl DoubleRegs:$src1, IntRegs:$src2))]>; + +//===----------------------------------------------------------------------===// +// STYPE/SHIFT - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// STYPE/VH + +//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// STYPE/VH - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// STYPE/VW + +//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// STYPE/VW - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SYSTEM/SUPER + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SYSTEM/USER + +//===----------------------------------------------------------------------===// +def SDHexagonBARRIER: SDTypeProfile<0, 0, []>; +def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDHexagonBARRIER, + [SDNPHasChain]>; + +let hasSideEffects = 1 in +def BARRIER : STInst<(outs), (ins), + "barrier", + [(HexagonBARRIER)]>; + +//===----------------------------------------------------------------------===// +// SYSTEM/SUPER - +//===----------------------------------------------------------------------===// + +// TFRI64 - assembly mapped. +let isReMaterializable = 1 in +def TFRI64 : ALU64_rr<(outs DoubleRegs:$dst), (ins s8Imm64:$src1), + "$dst = #$src1", + [(set DoubleRegs:$dst, s8Imm64Pred:$src1)]>; + +// Pseudo instruction to encode a set of conditional transfers. +// This instruction is used instead of a mux and trades-off codesize +// for performance. 
We conduct this transformation optimistically in +// the hope that these instructions get promoted to dot-new transfers. +let AddedComplexity = 100 in +def TFR_condset_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, + IntRegs:$src2, + IntRegs:$src3), + "Error; should not emit", + [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2, + IntRegs:$src3))]>; + +let AddedComplexity = 100 in +def TFR_condset_ii : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, s12Imm:$src2, s12Imm:$src3), + "Error; should not emit", + [(set IntRegs:$dst, (select PredRegs:$src1, + s12ImmPred:$src2, + s12ImmPred:$src3))]>; + +// Generate frameindex addresses. +let isReMaterializable = 1 in +def TFR_FI : ALU32_ri<(outs IntRegs:$dst), (ins FrameIndex:$src1), + "$dst = add($src1)", + [(set IntRegs:$dst, ADDRri:$src1)]>; + +// +// CR - Type. +// +let neverHasSideEffects = 1, Defs = [SA0, LC0] in { +def LOOP0_i : CRInst<(outs), (ins brtarget:$offset, u10Imm:$src2), + "loop0($offset, #$src2)", + []>; +} + +let neverHasSideEffects = 1, Defs = [SA0, LC0] in { +def LOOP0_r : CRInst<(outs), (ins brtarget:$offset, IntRegs:$src2), + "loop0($offset, $src2)", + []>; +} + +let isBranch = 1, isTerminator = 1, neverHasSideEffects = 1, + Defs = [PC, LC0], Uses = [SA0, LC0] in { +def ENDLOOP0 : CRInst<(outs), (ins brtarget:$offset), + ":endloop0", + []>; +} + +// Support for generating global address. +// Taken from X86InstrInfo.td. +def SDTHexagonCONST32 : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisPtrTy<0>]>; +def HexagonCONST32 : SDNode<"HexagonISD::CONST32", SDTHexagonCONST32>; +def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP", SDTHexagonCONST32>; + +// This pattern is incorrect. When we add small data, we should change +// this pattern to use memw(#foo). +let isMoveImm = 1 in +def CONST32 : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst = CONST32(#$global)", + [(set IntRegs:$dst, + (load (HexagonCONST32 tglobaltlsaddr:$global)))]>; + +let isReMaterializable = 1, isMoveImm = 1 in +def CONST32_set : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst = CONST32(#$global)", + [(set IntRegs:$dst, + (HexagonCONST32 tglobaladdr:$global))]>; + +let isReMaterializable = 1, isMoveImm = 1 in +def CONST32_set_jt : LDInst<(outs IntRegs:$dst), (ins jumptablebase:$jt), + "$dst = CONST32(#$jt)", + [(set IntRegs:$dst, + (HexagonCONST32 tjumptable:$jt))]>; + +let isReMaterializable = 1, isMoveImm = 1 in +def CONST32GP_set : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst = CONST32(#$global)", + [(set IntRegs:$dst, + (HexagonCONST32_GP tglobaladdr:$global))]>; + +let isReMaterializable = 1, isMoveImm = 1 in +def CONST32_Int_Real : LDInst<(outs IntRegs:$dst), (ins i32imm:$global), + "$dst = CONST32(#$global)", + [(set IntRegs:$dst, imm:$global) ]>; + +let isReMaterializable = 1, isMoveImm = 1 in +def CONST32_Label : LDInst<(outs IntRegs:$dst), (ins bblabel:$label), + "$dst = CONST32($label)", + [(set IntRegs:$dst, (HexagonCONST32 bbl:$label))]>; + +let isReMaterializable = 1, isMoveImm = 1 in +def CONST64_Int_Real : LDInst<(outs DoubleRegs:$dst), (ins i64imm:$global), + "$dst = CONST64(#$global)", + [(set DoubleRegs:$dst, imm:$global) ]>; + +def TFR_PdFalse : SInst<(outs PredRegs:$dst), (ins), + "$dst = xor($dst, $dst)", + [(set PredRegs:$dst, 0)]>; + +def MPY_trsext : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = mpy($src1, $src2)", + [(set IntRegs:$dst, + (trunc (i64 (srl (i64 (mul (i64 (sext IntRegs:$src1)), + (i64 (sext IntRegs:$src2)))), + 
(i32 32)))))]>;
+
+// Pseudo instructions.
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+
+def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_SPCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+
+def SDT_SPCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def call : SDNode<"HexagonISD::CALL", SDT_SPCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+
+// For tail calls, a HexagonTCRet SDNode has three SDNode properties: a chain,
+// an optional glue, and variadic arguments.
+// Its single operand has pointer type.
+def HexagonTCRet : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+let Defs = [R29, R30], Uses = [R31, R30, R29] in {
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+ "Should never be emitted",
+ [(callseq_start timm:$amt)]>;
+}
+
+let Defs = [R29, R30, R31], Uses = [R29] in {
+ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "Should never be emitted",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+// Call subroutine.
+let isCall = 1, neverHasSideEffects = 1,
+ Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
+ R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+ def CALL : JInst<(outs), (ins calltarget:$dst, variable_ops),
+ "call $dst", []>;
+}
+
+// Call subroutine from register.
+let isCall = 1, neverHasSideEffects = 1,
+ Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
+ R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+ def CALLR : JRInst<(outs), (ins IntRegs:$dst, variable_ops),
+ "callr $dst",
+ []>;
+ }
+
+// Tail Calls.
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
+ Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
+ R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+ def TCRETURNtg : JInst<(outs), (ins calltarget:$dst, variable_ops),
+ "jump $dst // TAILCALL", []>;
+}
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
+ Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
+ R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+ def TCRETURNtext : JInst<(outs), (ins calltarget:$dst, variable_ops),
+ "jump $dst // TAILCALL", []>;
+}
+
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
+ Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
+ R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+ def TCRETURNR : JInst<(outs), (ins IntRegs:$dst, variable_ops),
+ "jumpr $dst // TAILCALL", []>;
+}
+// Map call instruction.
+def : Pat<(call IntRegs:$dst),
+ (CALLR IntRegs:$dst)>, Requires<[HasV2TOnly]>;
+def : Pat<(call tglobaladdr:$dst),
+ (CALL tglobaladdr:$dst)>, Requires<[HasV2TOnly]>;
+def : Pat<(call texternalsym:$dst),
+ (CALL texternalsym:$dst)>, Requires<[HasV2TOnly]>;
+// Tail calls.
+def : Pat<(HexagonTCRet tglobaladdr:$dst),
+ (TCRETURNtg tglobaladdr:$dst)>;
+def : Pat<(HexagonTCRet texternalsym:$dst),
+ (TCRETURNtext texternalsym:$dst)>;
+def : Pat<(HexagonTCRet IntRegs:$dst),
+ (TCRETURNR IntRegs:$dst)>;
+
+// Map from r0 = and(r1, 65535) to r0 = zxth(r1).
+def : Pat <(and IntRegs:$src1, 65535),
+ (ZXTH IntRegs:$src1)>;
+
+// Map from r0 = and(r1, 255) to r0 = zxtb(r1).
+def : Pat <(and IntRegs:$src1, 255),
+ (ZXTB IntRegs:$src1)>;
+
+// Map Add(p1, true) to p1 = not(p1): for an i1 value, true is -1, and adding
+// it flips the bit.
+// Add(p1, false) should never be produced; if it is, it must be mapped to a
+// no-op.
+def : Pat <(add PredRegs:$src1, -1),
+ (NOT_pp PredRegs:$src1)>;
+
+// Map from p0 = setlt(r0, r1) r2 = mux(p0, r3, r4) =>
+// p0 = cmp.lt(r0, r1), r0 = mux(p0, r2, r1).
+def : Pat <(select (i1 (setlt IntRegs:$src1, IntRegs:$src2)), IntRegs:$src3,
+ IntRegs:$src4),
+ (TFR_condset_rr (CMPLTrr IntRegs:$src1, IntRegs:$src2), IntRegs:$src4,
+ IntRegs:$src3)>, Requires<[HasV2TOnly]>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i).
+def : Pat <(select (not PredRegs:$src1), s8ImmPred:$src2, s8ImmPred:$src3),
+ (TFR_condset_ii PredRegs:$src1, s8ImmPred:$src3, s8ImmPred:$src2)>;
+
+// Map from p0 = pnot(p0); if (p0) jump => if (!p0) jump.
+def : Pat <(brcond (not PredRegs:$src1), bb:$offset),
+ (JMP_PredNot PredRegs:$src1, bb:$offset)>;
+
+// Map from p2 = pnot(p2); p1 = and(p0, p2) => p1 = and(p0, !p2).
+def : Pat <(and PredRegs:$src1, (not PredRegs:$src2)),
+ (AND_pnotp PredRegs:$src1, PredRegs:$src2)>;
+
+// Map from store(globaladdress + x) -> memd(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(store DoubleRegs:$src1,
+ (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (STrid_GP tglobaladdr:$global, u16ImmPred:$offset, DoubleRegs:$src1)>;
+
+// Map from store(globaladdress) -> memd(#foo + 0).
+let AddedComplexity = 100 in
+def : Pat <(store DoubleRegs:$src1, (HexagonCONST32_GP tglobaladdr:$global)),
+ (STrid_GP tglobaladdr:$global, 0, DoubleRegs:$src1)>;
+
+// Map from store(globaladdress + x) -> memw(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(store IntRegs:$src1, (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (STriw_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>;
+
+// Map from store(globaladdress) -> memw(#foo + 0).
+let AddedComplexity = 100 in
+def : Pat <(store IntRegs:$src1, (HexagonCONST32_GP tglobaladdr:$global)),
+ (STriw_GP tglobaladdr:$global, 0, IntRegs:$src1)>;
+
+// Map from store(globaladdress + x) -> memh(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(truncstorei16 IntRegs:$src1,
+ (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (STrih_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>;
+
+// Map from store(globaladdress) -> memh(#foo).
+let AddedComplexity = 100 in
+def : Pat <(truncstorei16 IntRegs:$src1,
+ (HexagonCONST32_GP tglobaladdr:$global)),
+ (STh_GP tglobaladdr:$global, IntRegs:$src1)>;
+
+// Map from store(globaladdress + x) -> memb(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(truncstorei8 IntRegs:$src1,
+ (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (STrib_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>;
+
+// Map from store(globaladdress) -> memb(#foo).
+let AddedComplexity = 100 in
+def : Pat <(truncstorei8 IntRegs:$src1,
+ (HexagonCONST32_GP tglobaladdr:$global)),
+ (STb_GP tglobaladdr:$global, IntRegs:$src1)>;
+
+// Map from load(globaladdress + x) -> memw(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(load (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (LDriw_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+
+// Map from load(globaladdress) -> memw(#foo + 0).
+let AddedComplexity = 100 in
+def : Pat <(load (HexagonCONST32_GP tglobaladdr:$global)),
+ (LDw_GP tglobaladdr:$global)>;
+
+// Map from load(globaladdress + x) -> memd(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(i64 (load (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (LDrid_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+
+// Map from load(globaladdress) -> memd(#foo + 0).
+let AddedComplexity = 100 in
+def : Pat <(i64 (load (HexagonCONST32_GP tglobaladdr:$global))),
+ (LDd_GP tglobaladdr:$global)>;
+
+
+// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress + 0), Pd = Rd.
+let AddedComplexity = 100 in
+def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))),
+ (TFR_PdRs (LDrib_GP tglobaladdr:$global, 0))>;
+
+// Map from load(globaladdress + x) -> memh(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(sextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (LDrih_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+
+// Map from load(globaladdress) -> memh(#foo + 0).
+let AddedComplexity = 100 in
+def : Pat <(sextloadi16 (HexagonCONST32_GP tglobaladdr:$global)),
+ (LDrih_GP tglobaladdr:$global, 0)>;
+
+// Map from load(globaladdress + x) -> memuh(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(zextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+
+// Map from load(globaladdress) -> memuh(#foo + 0).
+let AddedComplexity = 100 in
+def : Pat <(zextloadi16 (HexagonCONST32_GP tglobaladdr:$global)),
+ (LDriuh_GP tglobaladdr:$global, 0)>;
+
+// Map from load(globaladdress + x) -> memuh(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(extloadi16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+
+// Map from load(globaladdress) -> memuh(#foo + 0).
+let AddedComplexity = 100 in
+def : Pat <(extloadi16 (HexagonCONST32_GP tglobaladdr:$global)),
+ (LDriuh_GP tglobaladdr:$global, 0)>;
+// Map from load(globaladdress + x) -> memub(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(zextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (LDriub_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+
+// Map from load(globaladdress) -> memub(#foo + 0).
+let AddedComplexity = 100 in
+def : Pat <(zextloadi8 (HexagonCONST32_GP tglobaladdr:$global)),
+ (LDriub_GP tglobaladdr:$global, 0)>;
+
+// Map from load(globaladdress + x) -> memb(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(sextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (LDrib_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+
+// Map from load(globaladdress) -> memb(#foo).
+let AddedComplexity = 100 in
+def : Pat <(extloadi8 (HexagonCONST32_GP tglobaladdr:$global)),
+ (LDb_GP tglobaladdr:$global)>;
+
+// Map from load(globaladdress) -> memb(#foo).
+let AddedComplexity = 100 in
+def : Pat <(sextloadi8 (HexagonCONST32_GP tglobaladdr:$global)),
+ (LDb_GP tglobaladdr:$global)>;
+
+// Map from load(globaladdress) -> memub(#foo).
+let AddedComplexity = 100 in
+def : Pat <(zextloadi8 (HexagonCONST32_GP tglobaladdr:$global)),
+ (LDub_GP tglobaladdr:$global)>;
+
+// When the Interprocedural Global Variable optimizer realizes that a
+// certain global variable takes only two constant values, it shrinks the
+// global to a boolean. Catch those loads here in the following 3 patterns.
+let AddedComplexity = 100 in +def : Pat <(extloadi1 (HexagonCONST32_GP tglobaladdr:$global)), + (LDb_GP tglobaladdr:$global)>; + +let AddedComplexity = 100 in +def : Pat <(sextloadi1 (HexagonCONST32_GP tglobaladdr:$global)), + (LDb_GP tglobaladdr:$global)>; + +let AddedComplexity = 100 in +def : Pat <(zextloadi1 (HexagonCONST32_GP tglobaladdr:$global)), + (LDub_GP tglobaladdr:$global)>; + +// Map from load(globaladdress) -> memh(#foo). +let AddedComplexity = 100 in +def : Pat <(extloadi16 (HexagonCONST32_GP tglobaladdr:$global)), + (LDh_GP tglobaladdr:$global)>; + +// Map from load(globaladdress) -> memh(#foo). +let AddedComplexity = 100 in +def : Pat <(sextloadi16 (HexagonCONST32_GP tglobaladdr:$global)), + (LDh_GP tglobaladdr:$global)>; + +// Map from load(globaladdress) -> memuh(#foo). +let AddedComplexity = 100 in +def : Pat <(zextloadi16 (HexagonCONST32_GP tglobaladdr:$global)), + (LDuh_GP tglobaladdr:$global)>; + +// Map from i1 loads to 32 bits. This assumes that the i1* is byte aligned. +def : Pat <(i32 (zextloadi1 ADDRriS11_0:$addr)), + (AND_rr (LDrib ADDRriS11_0:$addr), (TFRI 0x1))>; + +// Map from Rdd = sign_extend_inreg(Rss, i32) -> Rdd = SXTW(Rss.lo). +def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i32)), + (i64 (SXTW (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg)))>; + +// Map from Rdd = sign_extend_inreg(Rss, i16) -> Rdd = SXTW(SXTH(Rss.lo)). +def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i16)), + (i64 (SXTW (SXTH (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg))))>; + +// Map from Rdd = sign_extend_inreg(Rss, i8) -> Rdd = SXTW(SXTB(Rss.lo)). +def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i8)), + (i64 (SXTW (SXTB (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg))))>; + +// We want to prevent emiting pnot's as much as possible. +// Map brcond with an unsupported setcc to a JMP_PredNot. +def : Pat <(brcond (i1 (setne IntRegs:$src1, IntRegs:$src2)), bb:$offset), + (JMP_PredNot (CMPEQrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>; + +def : Pat <(brcond (i1 (setne IntRegs:$src1, s10ImmPred:$src2)), bb:$offset), + (JMP_PredNot (CMPEQri IntRegs:$src1, s10ImmPred:$src2), bb:$offset)>; + +def : Pat <(brcond (i1 (setne PredRegs:$src1, (i1 -1))), bb:$offset), + (JMP_PredNot PredRegs:$src1, bb:$offset)>; + +def : Pat <(brcond (i1 (setne PredRegs:$src1, (i1 0))), bb:$offset), + (JMP_Pred PredRegs:$src1, bb:$offset)>; + +def : Pat <(brcond (i1 (setlt IntRegs:$src1, s8ImmPred:$src2)), bb:$offset), + (JMP_PredNot (CMPGEri IntRegs:$src1, s8ImmPred:$src2), bb:$offset)>; + +def : Pat <(brcond (i1 (setlt IntRegs:$src1, IntRegs:$src2)), bb:$offset), + (JMP_Pred (CMPLTrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>; + +def : Pat <(brcond (i1 (setuge DoubleRegs:$src1, DoubleRegs:$src2)), + bb:$offset), + (JMP_PredNot (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1), + bb:$offset)>; + +def : Pat <(brcond (i1 (setule IntRegs:$src1, IntRegs:$src2)), bb:$offset), + (JMP_PredNot (CMPGTUrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>; + +def : Pat <(brcond (i1 (setule DoubleRegs:$src1, DoubleRegs:$src2)), + bb:$offset), + (JMP_PredNot (CMPGTU64rr DoubleRegs:$src1, DoubleRegs:$src2), + bb:$offset)>; + +// Map from a 64-bit select to an emulated 64-bit mux. +// Hexagon does not support 64-bit MUXes; so emulate with combines. 
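+// e.g. Rdd = select(Pu, Rss, Rtt) becomes
+//   Rdd = combine(mux(Pu, Rss.h, Rtt.h), mux(Pu, Rss.l, Rtt.l))
+// i.e. one 32-bit mux per half, glued back together with combine.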
+def : Pat <(select PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (COMBINE_rr + (MUX_rr PredRegs:$src1, + (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg), + (EXTRACT_SUBREG DoubleRegs:$src3, subreg_hireg)), + (MUX_rr PredRegs:$src1, + (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg), + (EXTRACT_SUBREG DoubleRegs:$src3, subreg_loreg)))>; + +// Map from a 1-bit select to logical ops. +// From LegalizeDAG.cpp: (B1 ? B2 : B3) <=> (B1 & B2)|(!B1&B3). +def : Pat <(select PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), + (OR_pp (AND_pp PredRegs:$src1, PredRegs:$src2), + (AND_pp (NOT_pp PredRegs:$src1), PredRegs:$src3))>; + +// Map Pd = load(addr) -> Rs = load(addr); Pd = Rs. +def : Pat<(i1 (load ADDRriS11_2:$addr)), + (i1 (TFR_PdRs (i32 (LDrib ADDRriS11_2:$addr))))>; + +// Map for truncating from 64 immediates to 32 bit immediates. +def : Pat<(i32 (trunc DoubleRegs:$src)), + (i32 (EXTRACT_SUBREG DoubleRegs:$src, subreg_loreg))>; + +// Map for truncating from i64 immediates to i1 bit immediates. +def : Pat<(i1 (trunc DoubleRegs:$src)), + (i1 (TFR_PdRs (i32(EXTRACT_SUBREG DoubleRegs:$src, subreg_loreg))))>; + +// Map memw(Rs) = Rdd -> memw(Rs) = Rt. +def : Pat<(truncstorei8 DoubleRegs:$src, ADDRriS11_0:$addr), + (STrib ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG DoubleRegs:$src, + subreg_loreg)))>; + +// Map memh(Rs) = Rdd -> memh(Rs) = Rt. +def : Pat<(truncstorei16 DoubleRegs:$src, ADDRriS11_0:$addr), + (STrih ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG DoubleRegs:$src, + subreg_loreg)))>; + +// Map from i1 = constant<-1>; memw(addr) = i1 -> r0 = 1; memw(addr) = r0. +def : Pat<(store (i1 -1), ADDRriS11_2:$addr), + (STrib ADDRriS11_2:$addr, (TFRI 1))>; + +let AddedComplexity = 100 in +// Map from i1 = constant<-1>; memw(CONST32(#foo)) = i1 -> r0 = 1; +// memw(#foo) = r0 +def : Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)), + (STb_GP tglobaladdr:$global, (TFRI 1))>; + + +// Map from i1 = constant<-1>; store i1 -> r0 = 1; store r0. +def : Pat<(store (i1 -1), ADDRriS11_2:$addr), + (STrib ADDRriS11_2:$addr, (TFRI 1))>; + +// Map from memb(Rs) = Pd -> Rt = mux(Pd, #0, #1); store Rt. +def : Pat<(store PredRegs:$src1, ADDRriS11_2:$addr), + (STrib ADDRriS11_2:$addr, (i32 (MUX_ii PredRegs:$src1, 1, 0)) )>; + +// Map Rdd = anyext(Rs) -> Rdd = sxtw(Rs). +// Hexagon_TODO: We can probably use combine but that will cost 2 instructions. +// Better way to do this? +def : Pat<(i64 (anyext IntRegs:$src1)), + (i64 (SXTW IntRegs:$src1))>; + +// Map cmple -> cmpgt. +// rs <= rt -> !(rs > rt). +def : Pat<(i1 (setle IntRegs:$src1, s10ImmPred:$src2)), + (i1 (NOT_Ps (CMPGTri IntRegs:$src1, s10ImmPred:$src2)))>; + +// rs <= rt -> !(rs > rt). +def : Pat<(i1 (setle IntRegs:$src1, IntRegs:$src2)), + (i1 (NOT_Ps (CMPGTrr IntRegs:$src1, IntRegs:$src2)))>; + +// Rss <= Rtt -> !(Rss > Rtt). +def : Pat<(i1 (setle DoubleRegs:$src1, DoubleRegs:$src2)), + (i1 (NOT_Ps (CMPGT64rr DoubleRegs:$src1, DoubleRegs:$src2)))>; + +// Map cmpne -> cmpeq. +// Hexagon_TODO: We should improve on this. +// rs != rt -> !(rs == rt). +def : Pat <(i1 (setne IntRegs:$src1, s10ImmPred:$src2)), + (i1 (NOT_Ps(i1 (CMPEQri IntRegs:$src1, s10ImmPred:$src2))))>; + +// Map cmpne(Rs) -> !cmpeqe(Rs). +// rs != rt -> !(rs == rt). +def : Pat <(i1 (setne IntRegs:$src1, IntRegs:$src2)), + (i1 (NOT_Ps(i1 (CMPEQrr IntRegs:$src1, IntRegs:$src2))))>; + +// Convert setne back to xor for hexagon since we compute w/ pred registers. 
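+// For single-bit predicates (p1 != p2) is exactly (p1 ^ p2):
+//   0^0 = 0, 0^1 = 1, 1^0 = 1, 1^1 = 0.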
+def : Pat <(i1 (setne PredRegs:$src1, PredRegs:$src2)), + (i1 (XOR_pp PredRegs:$src1, PredRegs:$src2))>; + +// Map cmpne(Rss) -> !cmpew(Rss). +// rs != rt -> !(rs == rt). +def : Pat <(i1 (setne DoubleRegs:$src1, DoubleRegs:$src2)), + (i1 (NOT_Ps(i1 (CMPEHexagon4rr DoubleRegs:$src1, DoubleRegs:$src2))))>; + +// Map cmpge(Rs, Rt) -> !(cmpgt(Rs, Rt). +// rs >= rt -> !(rt > rs). +def : Pat <(i1 (setge IntRegs:$src1, IntRegs:$src2)), + (i1 (NOT_Ps(i1 (CMPGTrr IntRegs:$src2, IntRegs:$src1))))>; + +def : Pat <(i1 (setge IntRegs:$src1, s8ImmPred:$src2)), + (i1 (CMPGEri IntRegs:$src1, s8ImmPred:$src2))>; + +// Map cmpge(Rss, Rtt) -> !cmpgt(Rtt, Rss). +// rss >= rtt -> !(rtt > rss). +def : Pat <(i1 (setge DoubleRegs:$src1, DoubleRegs:$src2)), + (i1 (NOT_Ps(i1 (CMPGT64rr DoubleRegs:$src2, DoubleRegs:$src1))))>; + +// Map cmplt(Rs, Imm) -> !cmpge(Rs, Imm). +// rs < rt -> !(rs >= rt). +def : Pat <(i1 (setlt IntRegs:$src1, s8ImmPred:$src2)), + (i1 (NOT_Ps (CMPGEri IntRegs:$src1, s8ImmPred:$src2)))>; + +// Map cmplt(Rs, Rt) -> cmplt(Rs, Rt). +// rs < rt -> rs < rt. Let assembler map it. +def : Pat <(i1 (setlt IntRegs:$src1, IntRegs:$src2)), + (i1 (CMPLTrr IntRegs:$src2, IntRegs:$src1))>; + +// Map cmplt(Rss, Rtt) -> cmpgt(Rtt, Rss). +// rss < rtt -> (rtt > rss). +def : Pat <(i1 (setlt DoubleRegs:$src1, DoubleRegs:$src2)), + (i1 (CMPGT64rr DoubleRegs:$src2, DoubleRegs:$src1))>; + +// Map from cmpltu(Rs, Rd) -> !cmpgtu(Rs, Rd - 1). +// rs < rt -> rt > rs. +def : Pat <(i1 (setult IntRegs:$src1, IntRegs:$src2)), + (i1 (CMPGTUrr IntRegs:$src2, IntRegs:$src1))>; + +// Map from cmpltu(Rss, Rdd) -> !cmpgtu(Rss, Rdd - 1). +// rs < rt -> rt > rs. +def : Pat <(i1 (setult DoubleRegs:$src1, DoubleRegs:$src2)), + (i1 (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1))>; + +// Map from Rs >= Rt -> !(Rt > Rs). +// rs >= rt -> !(rt > rs). +def : Pat <(i1 (setuge IntRegs:$src1, IntRegs:$src2)), + (i1 (NOT_Ps (CMPGTUrr IntRegs:$src2, IntRegs:$src1)))>; + +// Map from Rs >= Rt -> !(Rt > Rs). +// rs >= rt -> !(rt > rs). +def : Pat <(i1 (setuge DoubleRegs:$src1, DoubleRegs:$src2)), + (i1 (NOT_Ps (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1)))>; + +// Map from cmpleu(Rs, Rs) -> !cmpgtu(Rs, Rs). +// Map from (Rs <= Rt) -> !(Rs > Rt). +def : Pat <(i1 (setule IntRegs:$src1, IntRegs:$src2)), + (i1 (NOT_Ps (CMPGTUrr IntRegs:$src1, IntRegs:$src2)))>; + +// Map from cmpleu(Rss, Rtt) -> !cmpgtu(Rss, Rtt-1). +// Map from (Rs <= Rt) -> !(Rs > Rt). +def : Pat <(i1 (setule DoubleRegs:$src1, DoubleRegs:$src2)), + (i1 (NOT_Ps (CMPGTU64rr DoubleRegs:$src1, DoubleRegs:$src2)))>; + +// Sign extends. +// i1 -> i32 +def : Pat <(i32 (sext PredRegs:$src1)), + (i32 (MUX_ii PredRegs:$src1, -1, 0))>; + +// Convert sign-extended load back to load and sign extend. +// i8 -> i64 +def: Pat <(i64 (sextloadi8 ADDRriS11_0:$src1)), + (i64 (SXTW (LDrib ADDRriS11_0:$src1)))>; + +// Convert any-extended load back to load and sign extend. +// i8 -> i64 +def: Pat <(i64 (extloadi8 ADDRriS11_0:$src1)), + (i64 (SXTW (LDrib ADDRriS11_0:$src1)))>; + +// Convert sign-extended load back to load and sign extend. +// i16 -> i64 +def: Pat <(i64 (sextloadi16 ADDRriS11_1:$src1)), + (i64 (SXTW (LDrih ADDRriS11_1:$src1)))>; + +// Convert sign-extended load back to load and sign extend. +// i32 -> i64 +def: Pat <(i64 (sextloadi32 ADDRriS11_2:$src1)), + (i64 (SXTW (LDriw ADDRriS11_2:$src1)))>; + + +// Zero extends. 
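+// A predicate is widened with a mux (#1 or #0); a 32-bit register is widened
+// to 64 bits with combine(#0, Rs), i.e. the upper word is simply zero.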
+// i1 -> i32 +def : Pat <(i32 (zext PredRegs:$src1)), + (i32 (MUX_ii PredRegs:$src1, 1, 0))>; + +// i1 -> i64 +def : Pat <(i64 (zext PredRegs:$src1)), + (i64 (COMBINE_rr (TFRI 0), (MUX_ii PredRegs:$src1, 1, 0)))>; + +// i32 -> i64 +def : Pat <(i64 (zext IntRegs:$src1)), + (i64 (COMBINE_rr (TFRI 0), IntRegs:$src1))>; + +// i8 -> i64 +def: Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)), + (i64 (COMBINE_rr (TFRI 0), (LDriub ADDRriS11_0:$src1)))>; + +// i16 -> i64 +def: Pat <(i64 (zextloadi16 ADDRriS11_1:$src1)), + (i64 (COMBINE_rr (TFRI 0), (LDriuh ADDRriS11_1:$src1)))>; + +// i32 -> i64 +def: Pat <(i64 (zextloadi32 ADDRriS11_2:$src1)), + (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>; + +def: Pat <(i32 (zextloadi1 ADDRriS11_0:$src1)), + (i32 (LDriw ADDRriS11_0:$src1))>; + +// Map from Rs = Pd to Pd = mux(Pd, #1, #0) +def : Pat <(i32 (zext PredRegs:$src1)), + (i32 (MUX_ii PredRegs:$src1, 1, 0))>; + +// Map from Rs = Pd to Pd = mux(Pd, #1, #0) +def : Pat <(i32 (anyext PredRegs:$src1)), + (i32 (MUX_ii PredRegs:$src1, 1, 0))>; + +// Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0)) +def : Pat <(i64 (anyext PredRegs:$src1)), + (i64 (SXTW (i32 (MUX_ii PredRegs:$src1, 1, 0))))>; + + +// Any extended 64-bit load. +// anyext i32 -> i64 +def: Pat <(i64 (extloadi32 ADDRriS11_2:$src1)), + (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>; + +// anyext i16 -> i64. +def: Pat <(i64 (extloadi16 ADDRriS11_2:$src1)), + (i64 (COMBINE_rr (TFRI 0), (LDrih ADDRriS11_2:$src1)))>; + +// Map from Rdd = zxtw(Rs) -> Rdd = combine(0, Rs). +def : Pat<(i64 (zext IntRegs:$src1)), + (i64 (COMBINE_rr (TFRI 0), IntRegs:$src1))>; + +// Multiply 64-bit unsigned and use upper result. +def : Pat <(mulhu DoubleRegs:$src1, DoubleRegs:$src2), + (MPYU64_acc(COMBINE_rr (TFRI 0), + (EXTRACT_SUBREG + (LSRd_ri(MPYU64_acc(MPYU64_acc(COMBINE_rr (TFRI 0), + (EXTRACT_SUBREG (LSRd_ri(MPYU64 + (EXTRACT_SUBREG DoubleRegs:$src1, + subreg_loreg), + (EXTRACT_SUBREG DoubleRegs:$src2, + subreg_loreg)), + 32) ,subreg_loreg)), + (EXTRACT_SUBREG DoubleRegs:$src1, + subreg_hireg), + (EXTRACT_SUBREG DoubleRegs:$src2, + subreg_loreg)), + (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), + (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)), + 32),subreg_loreg)), + (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg), + (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg) + )>; + +// Multiply 64-bit signed and use upper result. +def : Pat <(mulhs DoubleRegs:$src1, DoubleRegs:$src2), + (MPY64_acc(COMBINE_rr (TFRI 0), + (EXTRACT_SUBREG + (LSRd_ri(MPY64_acc(MPY64_acc(COMBINE_rr (TFRI 0), + (EXTRACT_SUBREG (LSRd_ri(MPYU64 + (EXTRACT_SUBREG DoubleRegs:$src1, + subreg_loreg), + (EXTRACT_SUBREG DoubleRegs:$src2, + subreg_loreg)), + 32) ,subreg_loreg)), + (EXTRACT_SUBREG DoubleRegs:$src1, + subreg_hireg), + (EXTRACT_SUBREG DoubleRegs:$src2, + subreg_loreg)), + (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), + (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)), + 32),subreg_loreg)), + (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg), + (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg) + )>; + +// Hexagon specific ISD nodes. +def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>]>; +def Hexagon_ADJDYNALLOC : SDNode<"HexagonISD::ADJDYNALLOC", + SDTHexagonADJDYNALLOC>; +// Needed to tag these instructions for stack layout. 
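+// ADJDYNALLOC is what dynamic stack allocations (alloca) lower to; it is just
+// an add of an immediate, but keeping it as a distinct opcode lets the stack
+// layout code find and adjust these allocations afterwards.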
+let usesCustomInserter = 1 in +def ADJDYNALLOC : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1, + s16Imm:$src2), + "$dst = add($src1, #$src2)", + [(set IntRegs:$dst, (Hexagon_ADJDYNALLOC IntRegs:$src1, + s16ImmPred:$src2))]>; + +def SDTHexagonARGEXTEND : SDTypeProfile<1, 1, []>; +def Hexagon_ARGEXTEND : SDNode<"HexagonISD::ARGEXTEND", SDTHexagonARGEXTEND>; +def ARGEXTEND : ALU32_rr <(outs IntRegs:$dst), (ins IntRegs:$src1), + "$dst = $src1", + [(set IntRegs:$dst, (Hexagon_ARGEXTEND IntRegs:$src1))]>; + +let AddedComplexity = 100 in +def : Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND IntRegs:$src1), i16)), + (TFR IntRegs:$src1)>; + + +def SDHexagonBR_JT: SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; +def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>; + +let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in +def BR_JT : JRInst<(outs), (ins IntRegs:$src), + "jumpr $src", + [(HexagonBR_JT IntRegs:$src)]>; +def HexagonWrapperJT: SDNode<"HexagonISD::WrapperJT", SDTIntUnaryOp>; + +def : Pat<(HexagonWrapperJT tjumptable:$dst), + (CONST32_set_jt tjumptable:$dst)>; + + +//===----------------------------------------------------------------------===// +// V3 Instructions + +//===----------------------------------------------------------------------===// + +include "HexagonInstrInfoV3.td" + +//===----------------------------------------------------------------------===// +// V3 Instructions - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// V4 Instructions + +//===----------------------------------------------------------------------===// + +include "HexagonInstrInfoV4.td" diff --git a/lib/Target/Hexagon/HexagonInstrInfoV3.td b/lib/Target/Hexagon/HexagonInstrInfoV3.td new file mode 100644 index 0000000..a73897e --- /dev/null +++ b/lib/Target/Hexagon/HexagonInstrInfoV3.td @@ -0,0 +1,134 @@ +//=- HexagonInstrInfoV3.td - Target Desc. for Hexagon Target -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon V3 instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// J + +//===----------------------------------------------------------------------===// +// Call subroutine. +let isCall = 1, neverHasSideEffects = 1, + Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31, + P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { + def CALLv3 : JInst<(outs), (ins calltarget:$dst, variable_ops), + "call $dst", []>, Requires<[HasV3T]>; +} + +//===----------------------------------------------------------------------===// +// J - +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// JR + +//===----------------------------------------------------------------------===// +// Call subroutine from register. 
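+// As for CALLv3 above, the Defs list models everything a call may clobber:
+// r0-r15 (as the register pairs d0-d7), r28, the link register r31, the
+// predicate registers and the hardware-loop registers lc0/sa0 and lc1/sa1.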
+let isCall = 1, neverHasSideEffects = 1, + Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31, + P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { + def CALLRv3 : JRInst<(outs), (ins IntRegs:$dst, variable_ops), + "callr $dst", + []>, Requires<[HasV3TOnly]>; + } + + +// if(p?.new) jumpr:t r? +let isReturn = 1, isTerminator = 1, isBarrier = 1, + Defs = [PC], Uses = [R31] in { + def JMPR_cPnewt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1.new) jumpr:t $src2", + []>, Requires<[HasV3T]>; +} + +// if (!p?.new) jumpr:t r? +let isReturn = 1, isTerminator = 1, isBarrier = 1, + Defs = [PC], Uses = [R31] in { + def JMPR_cNotPnewt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1.new) jumpr:t $src2", + []>, Requires<[HasV3T]>; +} + +// Not taken. +// if(p?.new) jumpr:nt r? +let isReturn = 1, isTerminator = 1, isBarrier = 1, + Defs = [PC], Uses = [R31] in { + def JMPR_cPnewNt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1.new) jumpr:nt $src2", + []>, Requires<[HasV3T]>; +} + +// if (!p?.new) jumpr:nt r? +let isReturn = 1, isTerminator = 1, isBarrier = 1, + Defs = [PC], Uses = [R31] in { + def JMPR_cNotPnewNt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1.new) jumpr:nt $src2", + []>, Requires<[HasV3T]>; +} + +//===----------------------------------------------------------------------===// +// JR - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ALU64/ALU + +//===----------------------------------------------------------------------===// + +let AddedComplexity = 200 in +def MAXw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = max($src2, $src1)", + [(set DoubleRegs:$dst, (select (i1 (setlt DoubleRegs:$src2, + DoubleRegs:$src1)), + DoubleRegs:$src1, + DoubleRegs:$src2))]>, +Requires<[HasV3T]>; + +let AddedComplexity = 200 in +def MINw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = min($src2, $src1)", + [(set DoubleRegs:$dst, (select (i1 (setgt DoubleRegs:$src2, + DoubleRegs:$src1)), + DoubleRegs:$src1, + DoubleRegs:$src2))]>, +Requires<[HasV3T]>; + +//===----------------------------------------------------------------------===// +// ALU64/ALU - +//===----------------------------------------------------------------------===// + + + + +//def : Pat <(brcond (i1 (seteq IntRegs:$src1, 0)), bb:$offset), +// (JMP_RegEzt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; + +//def : Pat <(brcond (i1 (setne IntRegs:$src1, 0)), bb:$offset), +// (JMP_RegNzt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; + +//def : Pat <(brcond (i1 (setle IntRegs:$src1, 0)), bb:$offset), +// (JMP_RegLezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; + +//def : Pat <(brcond (i1 (setge IntRegs:$src1, 0)), bb:$offset), +// (JMP_RegGezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; + +//def : Pat <(brcond (i1 (setgt IntRegs:$src1, -1)), bb:$offset), +// (JMP_RegGezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; + + +// Map call instruction +def : Pat<(call IntRegs:$dst), + (CALLRv3 IntRegs:$dst)>, Requires<[HasV3T]>; +def : Pat<(call tglobaladdr:$dst), + (CALLv3 tglobaladdr:$dst)>, Requires<[HasV3T]>; +def : Pat<(call texternalsym:$dst), + (CALLv3 texternalsym:$dst)>, Requires<[HasV3T]>; diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td new file mode 100644 index 0000000..24218d0 --- /dev/null +++ 
b/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -0,0 +1,3392 @@ +//=- HexagonInstrInfoV4.td - Target Desc. for Hexagon Target -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon V4 instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +// Hexagon V4 Architecture spec defines 8 instruction classes: +// LD ST ALU32 XTYPE J JR MEMOP NV CR SYSTEM(system is not implemented in the +// compiler) + +// LD Instructions: +// ======================================== +// Loads (8/16/32/64 bit) +// Deallocframe + +// ST Instructions: +// ======================================== +// Stores (8/16/32/64 bit) +// Allocframe + +// ALU32 Instructions: +// ======================================== +// Arithmetic / Logical (32 bit) +// Vector Halfword + +// XTYPE Instructions (32/64 bit): +// ======================================== +// Arithmetic, Logical, Bit Manipulation +// Multiply (Integer, Fractional, Complex) +// Permute / Vector Permute Operations +// Predicate Operations +// Shift / Shift with Add/Sub/Logical +// Vector Byte ALU +// Vector Halfword (ALU, Shift, Multiply) +// Vector Word (ALU, Shift) + +// J Instructions: +// ======================================== +// Jump/Call PC-relative + +// JR Instructions: +// ======================================== +// Jump/Call Register + +// MEMOP Instructions: +// ======================================== +// Operation on memory (8/16/32 bit) + +// NV Instructions: +// ======================================== +// New-value Jumps +// New-value Stores + +// CR Instructions: +// ======================================== +// Control-Register Transfers +// Hardware Loop Setup +// Predicate Logicals & Reductions + +// SYSTEM Instructions (not implemented in the compiler): +// ======================================== +// Prefetch +// Cache Maintenance +// Bus Operations + + +//===----------------------------------------------------------------------===// +// ALU32 + +//===----------------------------------------------------------------------===// + +// Shift halfword. 
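+// The predicated variants in this file follow a common naming scheme:
+//   _cPt   : if (Pv)         _cNotPt   : if (!Pv)
+//   _cdnPt : if (Pv.new)     _cdnNotPt : if (!Pv.new)
+// where the .new forms consume a predicate produced earlier in the same packet.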
+ +def ASLH_cPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1) $dst = aslh($src2)", + []>, + Requires<[HasV4T]>; + +def ASLH_cNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1) $dst = aslh($src2)", + []>, + Requires<[HasV4T]>; + +def ASLH_cdnPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1.new) $dst = aslh($src2)", + []>, + Requires<[HasV4T]>; + +def ASLH_cdnNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1.new) $dst = aslh($src2)", + []>, + Requires<[HasV4T]>; + +def ASRH_cPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1) $dst = asrh($src2)", + []>, + Requires<[HasV4T]>; + +def ASRH_cNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1) $dst = asrh($src2)", + []>, + Requires<[HasV4T]>; + +def ASRH_cdnPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1.new) $dst = asrh($src2)", + []>, + Requires<[HasV4T]>; + +def ASRH_cdnNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1.new) $dst = asrh($src2)", + []>, + Requires<[HasV4T]>; + +// Sign extend. + +def SXTB_cPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1) $dst = sxtb($src2)", + []>, + Requires<[HasV4T]>; + +def SXTB_cNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1) $dst = sxtb($src2)", + []>, + Requires<[HasV4T]>; + +def SXTB_cdnPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1.new) $dst = sxtb($src2)", + []>, + Requires<[HasV4T]>; + +def SXTB_cdnNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1.new) $dst = sxtb($src2)", + []>, + Requires<[HasV4T]>; + + +def SXTH_cPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1) $dst = sxth($src2)", + []>, + Requires<[HasV4T]>; + +def SXTH_cNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1) $dst = sxth($src2)", + []>, + Requires<[HasV4T]>; + +def SXTH_cdnPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1.new) $dst = sxth($src2)", + []>, + Requires<[HasV4T]>; + +def SXTH_cdnNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1.new) $dst = sxth($src2)", + []>, + Requires<[HasV4T]>; + +// Zero exten. 
+ +let neverHasSideEffects = 1 in +def ZXTB_cPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1) $dst = zxtb($src2)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1 in +def ZXTB_cNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1) $dst = zxtb($src2)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1 in +def ZXTB_cdnPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1.new) $dst = zxtb($src2)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1 in +def ZXTB_cdnNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1.new) $dst = zxtb($src2)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1 in +def ZXTH_cPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1) $dst = zxth($src2)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1 in +def ZXTH_cNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1) $dst = zxth($src2)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1 in +def ZXTH_cdnPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if ($src1.new) $dst = zxth($src2)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1 in +def ZXTH_cdnNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2), + "if (!$src1.new) $dst = zxth($src2)", + []>, + Requires<[HasV4T]>; + + +//===----------------------------------------------------------------------===// +// ALU32 - +//===----------------------------------------------------------------------===// + + + +//===----------------------------------------------------------------------===// +// LD + +//===----------------------------------------------------------------------===// +/// +/// Make sure that in post increment load, the first operand is always the post +/// increment operand. +/// +//// Load doubleword. +// Rdd=memd(Re=#U6) + +// Rdd=memd(Rs+Rt<<#u2) +// Special case pattern for indexed load without offset which is easier to +// match. AddedComplexity of this pattern should be lower than base+offset load +// and lower yet than the more generic version with offset/shift below +// Similar approach is taken for all other base+index loads. +let AddedComplexity = 10, isPredicable = 1 in +def LDrid_indexed_V4 : LDInst<(outs DoubleRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst=memd($src1+$src2<<#0)", + [(set DoubleRegs:$dst, (load (add IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 40, isPredicable = 1 in +def LDrid_indexed_shl_V4 : LDInst<(outs DoubleRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), + "$dst=memd($src1+$src2<<#$offset)", + [(set DoubleRegs:$dst, (load (add IntRegs:$src1, + (shl IntRegs:$src2, + u2ImmPred:$offset))))]>, + Requires<[HasV4T]>; + +//// Load doubleword conditionally. 
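+// The conditional forms below carry no selection pattern ([]); they are meant
+// to be introduced later, e.g. by if-conversion, so only the mayLoad flag and
+// the assembly syntax matter here. The same holds for the other predicated
+// loads and stores in this file.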
+// if ([!]Pv[.new]) Rd=memd(Rs+Rt<<#u2) +// if (Pv) Rd=memd(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrid_indexed_cPt_V4 : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst=memd($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memd(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrid_indexed_cdnPt_V4 : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst=memd($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memd(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrid_indexed_cNotPt_V4 : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst=memd($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memd(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrid_indexed_cdnNotPt_V4 : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst=memd($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rd=memd(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrid_indexed_shl_cPt_V4 : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1) $dst=memd($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memd(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrid_indexed_shl_cdnPt_V4 : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1.new) $dst=memd($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memd(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrid_indexed_shl_cNotPt_V4 : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1) $dst=memd($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memd(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrid_indexed_shl_cdnNotPt_V4 : LDInst<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1.new) $dst=memd($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// Rdd=memd(Rt<<#u2+#U6) + +//// Load byte. 
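+// Byte loads come in sign-extending (memb), zero-extending (memub) and
+// any-extending (_ae, also memub) flavours, each with plain base+index and
+// base+index<<#shift addressing.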
+// Rd=memb(Re=#U6) + +// Rd=memb(Rs+Rt<<#u2) +let AddedComplexity = 10, isPredicable = 1 in +def LDrib_indexed_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst=memb($src1+$src2<<#0)", + [(set IntRegs:$dst, (sextloadi8 (add IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 10, isPredicable = 1 in +def LDriub_indexed_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst=memub($src1+$src2<<#0)", + [(set IntRegs:$dst, (zextloadi8 (add IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 10, isPredicable = 1 in +def LDriub_ae_indexed_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst=memub($src1+$src2<<#0)", + [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 40, isPredicable = 1 in +def LDrib_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), + "$dst=memb($src1+$src2<<#$offset)", + [(set IntRegs:$dst, + (sextloadi8 (add IntRegs:$src1, + (shl IntRegs:$src2, + u2ImmPred:$offset))))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 40, isPredicable = 1 in +def LDriub_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), + "$dst=memub($src1+$src2<<#$offset)", + [(set IntRegs:$dst, + (zextloadi8 (add IntRegs:$src1, + (shl IntRegs:$src2, + u2ImmPred:$offset))))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 40, isPredicable = 1 in +def LDriub_ae_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), + "$dst=memub($src1+$src2<<#$offset)", + [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, + (shl IntRegs:$src2, + u2ImmPred:$offset))))]>, + Requires<[HasV4T]>; + +//// Load byte conditionally. 
+// if ([!]Pv[.new]) Rd=memb(Rs+Rt<<#u2) +// if (Pv) Rd=memb(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrib_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst=memb($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memb(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrib_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst=memb($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memb(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrib_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst=memb($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memb(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrib_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst=memb($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rd=memb(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrib_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1) $dst=memb($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memb(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrib_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1.new) $dst=memb($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memb(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrib_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1) $dst=memb($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memb(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrib_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1.new) $dst=memb($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +//// Load unsigned byte conditionally. 
+// if ([!]Pv[.new]) Rd=memub(Rs+Rt<<#u2) +// if (Pv) Rd=memub(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriub_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst=memub($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memub(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriub_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst=memub($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memub(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriub_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst=memub($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memub(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriub_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst=memub($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rd=memub(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriub_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1) $dst=memub($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memub(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriub_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1.new) $dst=memub($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memub(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriub_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1) $dst=memub($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memub(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriub_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1.new) $dst=memub($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// Rd=memb(Rt<<#u2+#U6) + +//// Load halfword +// Rd=memh(Re=#U6) + +// Rd=memh(Rs+Rt<<#u2) +let AddedComplexity = 10, isPredicable = 1 in +def LDrih_indexed_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst=memh($src1+$src2<<#0)", + [(set IntRegs:$dst, (sextloadi16 (add IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 10, isPredicable = 1 in +def LDriuh_indexed_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst=memuh($src1+$src2<<#0)", + [(set IntRegs:$dst, (zextloadi16 (add IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 10, isPredicable = 1 in +def LDriuh_ae_indexed_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst=memuh($src1+$src2<<#0)", + [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV4T]>; + +// Rd=memh(Rs+Rt<<#u2) +let AddedComplexity = 40, isPredicable = 1 in +def LDrih_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), + "$dst=memh($src1+$src2<<#$offset)", + [(set IntRegs:$dst, + (sextloadi16 (add IntRegs:$src1, + (shl IntRegs:$src2, + u2ImmPred:$offset))))]>, + Requires<[HasV4T]>; + 
+let AddedComplexity = 40, isPredicable = 1 in +def LDriuh_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), + "$dst=memuh($src1+$src2<<#$offset)", + [(set IntRegs:$dst, + (zextloadi16 (add IntRegs:$src1, + (shl IntRegs:$src2, + u2ImmPred:$offset))))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 40, isPredicable = 1 in +def LDriuh_ae_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), + "$dst=memuh($src1+$src2<<#$offset)", + [(set IntRegs:$dst, + (extloadi16 (add IntRegs:$src1, + (shl IntRegs:$src2, + u2ImmPred:$offset))))]>, + Requires<[HasV4T]>; + +//// Load halfword conditionally. +// if ([!]Pv[.new]) Rd=memh(Rs+Rt<<#u2) +// if (Pv) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrih_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst=memh($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrih_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst=memh($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrih_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst=memh($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDrih_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst=memh($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrih_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1) $dst=memh($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrih_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1.new) $dst=memh($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrih_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1) $dst=memh($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDrih_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1.new) $dst=memh($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +//// Load unsigned halfword conditionally. 
+// if ([!]Pv[.new]) Rd=memuh(Rs+Rt<<#u2) +// if (Pv) Rd=memuh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriuh_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst=memuh($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memuh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriuh_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst=memuh($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memuh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriuh_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst=memuh($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memuh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriuh_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst=memuh($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rd=memuh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriuh_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1) $dst=memuh($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memuh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriuh_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1.new) $dst=memuh($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memuh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriuh_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1) $dst=memuh($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memuh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriuh_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1.new) $dst=memuh($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// Rd=memh(Rt<<#u2+#U6) + +//// Load word. +// Rd=memw(Re=#U6) + +// Rd=memw(Rs+Rt<<#u2) +let AddedComplexity = 10, isPredicable = 1 in +def LDriw_indexed_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst=memw($src1+$src2<<#0)", + [(set IntRegs:$dst, (load (add IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV4T]>; + +// Rd=memw(Rs+Rt<<#u2) +let AddedComplexity = 40, isPredicable = 1 in +def LDriw_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), + "$dst=memw($src1+$src2<<#$offset)", + [(set IntRegs:$dst, (load (add IntRegs:$src1, + (shl IntRegs:$src2, + u2ImmPred:$offset))))]>, + Requires<[HasV4T]>; + +//// Load word conditionally. 
+// if ([!]Pv[.new]) Rd=memw(Rs+Rt<<#u2) +// if (Pv) Rd=memw(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriw_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1) $dst=memw($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriw_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if ($src1.new) $dst=memw($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriw_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1) $dst=memw($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 15 in +def LDriw_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "if (!$src1.new) $dst=memw($src2+$src3<<#0)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriw_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1) $dst=memw($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriw_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if ($src1.new) $dst=memw($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriw_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1) $dst=memw($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) Rd=memh(Rs+Rt<<#u2) +let mayLoad = 1, AddedComplexity = 45 in +def LDriw_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, + u2Imm:$offset), + "if (!$src1.new) $dst=memw($src2+$src3<<#$offset)", + []>, + Requires<[HasV4T]>; + +// Rd=memw(Rt<<#u2+#U6) + + +// Post-inc Load, Predicated, Dot new + + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrid_cdnPt_V4 : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), + "if ($src1.new) $dst1 = memd($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrid_cdnNotPt_V4 : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), + "if (!$src1.new) $dst1 = memd($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrib_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), + "if ($src1.new) $dst1 = memb($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrib_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), + "if (!$src1.new) $dst1 = memb($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + +let mayLoad = 1, hasCtrlDep = 1, 
neverHasSideEffects = 1 in +def POST_LDrih_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), + "if ($src1.new) $dst1 = memh($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrih_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), + "if (!$src1.new) $dst1 = memh($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriub_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), + "if ($src1.new) $dst1 = memub($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriub_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), + "if (!$src1.new) $dst1 = memub($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriuh_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), + "if ($src1.new) $dst1 = memuh($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriuh_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), + "if (!$src1.new) $dst1 = memuh($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriw_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), + "if ($src1.new) $dst1 = memw($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + +let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriw_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), + "if (!$src1.new) $dst1 = memw($src2++#$src3)", + [], + "$src2 = $dst2">, + Requires<[HasV4T]>; + + +//===----------------------------------------------------------------------===// +// LD - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ST + +//===----------------------------------------------------------------------===// +/// +/// Assumptions::: ****** DO NOT IGNORE ******** +/// 1. Make sure that in post increment store, the zero'th operand is always the +/// post increment operand. +/// 2. Make sure that the store value operand(Rt/Rtt) in a store is always the +/// last operand. +/// + +// Store doubleword. 
+// memd(Re=#U6)=Rtt +// TODO: needs to be implemented + +// memd(Rs+#s11:3)=Rtt +// memd(Rs+Ru<<#u2)=Rtt +let AddedComplexity = 10, isPredicable = 1 in +def STrid_indexed_shl_V4 : STInst<(outs), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, DoubleRegs:$src4), + "memd($src1+$src2<<#$src3) = $src4", + [(store DoubleRegs:$src4, (add IntRegs:$src1, + (shl IntRegs:$src2, u2ImmPred:$src3)))]>, + Requires<[HasV4T]>; + +// memd(Ru<<#u2+#U6)=Rtt +let AddedComplexity = 10 in +def STrid_shl_V4 : STInst<(outs), + (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, DoubleRegs:$src4), + "memd($src1<<#$src2+#$src3) = $src4", + [(store DoubleRegs:$src4, (shl IntRegs:$src1, + (add u2ImmPred:$src2, + u6ImmPred:$src3)))]>, + Requires<[HasV4T]>; + +// memd(Rx++#s4:3)=Rtt +// memd(Rx++#s4:3:circ(Mu))=Rtt +// memd(Rx++I:circ(Mu))=Rtt +// memd(Rx++Mu)=Rtt +// memd(Rx++Mu:brev)=Rtt +// memd(gp+#u16:3)=Rtt + +// Store doubleword conditionally. +// if ([!]Pv[.new]) memd(#u6)=Rtt +// TODO: needs to be implemented. + +// if ([!]Pv[.new]) memd(Rs+#u6:3)=Rtt +// if (Pv) memd(Rs+#u6:3)=Rtt +// if (Pv.new) memd(Rs+#u6:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), + "if ($src1.new) memd($addr) = $src2", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memd(Rs+#u6:3)=Rtt +// if (!Pv.new) memd(Rs+#u6:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), + "if (!$src1.new) memd($addr) = $src2", + []>, + Requires<[HasV4T]>; + +// if (Pv) memd(Rs+#u6:3)=Rtt +// if (Pv.new) memd(Rs+#u6:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_indexed_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, + DoubleRegs:$src4), + "if ($src1.new) memd($src2+#$src3) = $src4", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memd(Rs+#u6:3)=Rtt +// if (!Pv.new) memd(Rs+#u6:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_indexed_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, + DoubleRegs:$src4), + "if (!$src1.new) memd($src2+#$src3) = $src4", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memd(Rs+Ru<<#u2)=Rtt +// if (Pv) memd(Rs+Ru<<#u2)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_indexed_shl_cPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + DoubleRegs:$src5), + "if ($src1) memd($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memd(Rs+Ru<<#u2)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_indexed_shl_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + DoubleRegs:$src5), + "if ($src1) memd($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; +// if (!Pv) memd(Rs+Ru<<#u2)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_indexed_shl_cNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + DoubleRegs:$src5), + "if (!$src1) memd($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; +// if (!Pv.new) memd(Rs+Ru<<#u2)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def STrid_indexed_shl_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + DoubleRegs:$src5), 
+ "if (!$src1.new) memd($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memd(Rx++#s4:3)=Rtt +// if (Pv) memd(Rx++#s4:3)=Rtt +// if (Pv.new) memd(Rx++#s4:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def POST_STdri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, + s4_3Imm:$offset), + "if ($src1.new) memd($src3++#$offset) = $src2", + [], + "$src3 = $dst">, + Requires<[HasV4T]>; + +// if (!Pv) memd(Rx++#s4:3)=Rtt +// if (!Pv.new) memd(Rx++#s4:3)=Rtt +let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in +def POST_STdri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, + s4_3Imm:$offset), + "if (!$src1.new) memd($src3++#$offset) = $src2", + [], + "$src3 = $dst">, + Requires<[HasV4T]>; + + +// Store byte. +// memb(Re=#U6)=Rt +// TODO: needs to be implemented. +// memb(Rs+#s11:0)=Rt +// memb(Rs+#u6:0)=#S8 +let AddedComplexity = 10, isPredicable = 1 in +def STrib_imm_V4 : STInst<(outs), + (ins IntRegs:$src1, u6_0Imm:$src2, s8Imm:$src3), + "memb($src1+#$src2) = #$src3", + [(truncstorei8 s8ImmPred:$src3, (add IntRegs:$src1, + u6_0ImmPred:$src2))]>, + Requires<[HasV4T]>; + +// memb(Rs+Ru<<#u2)=Rt +let AddedComplexity = 10, isPredicable = 1 in +def STrib_indexed_shl_V4 : STInst<(outs), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), + "memb($src1+$src2<<#$src3) = $src4", + [(truncstorei8 IntRegs:$src4, (add IntRegs:$src1, + (shl IntRegs:$src2, + u2ImmPred:$src3)))]>, + Requires<[HasV4T]>; + +// memb(Ru<<#u2+#U6)=Rt +let AddedComplexity = 10 in +def STrib_shl_V4 : STInst<(outs), + (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + "memb($src1<<#$src2+#$src3) = $src4", + [(truncstorei8 IntRegs:$src4, (shl IntRegs:$src1, + (add u2ImmPred:$src2, + u6ImmPred:$src3)))]>, + Requires<[HasV4T]>; + +// memb(Rx++#s4:0:circ(Mu))=Rt +// memb(Rx++I:circ(Mu))=Rt +// memb(Rx++Mu)=Rt +// memb(Rx++Mu:brev)=Rt +// memb(gp+#u16:0)=Rt + + +// Store byte conditionally. 
+// if ([!]Pv[.new]) memb(#u6)=Rt +// if ([!]Pv[.new]) memb(Rs+#u6:0)=#S6 +// if (Pv) memb(Rs+#u6:0)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_imm_cPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), + "if ($src1) memb($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memb(Rs+#u6:0)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_imm_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), + "if ($src1.new) memb($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memb(Rs+#u6:0)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_imm_cNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), + "if (!$src1) memb($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memb(Rs+#u6:0)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_imm_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), + "if (!$src1.new) memb($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memb(Rs+#u6:0)=Rt +// if (Pv) memb(Rs+#u6:0)=Rt +// if (Pv.new) memb(Rs+#u6:0)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1.new) memb($addr) = $src2", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memb(Rs+#u6:0)=Rt +// if (!Pv.new) memb(Rs+#u6:0)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1.new) memb($addr) = $src2", + []>, + Requires<[HasV4T]>; + +// if (Pv) memb(Rs+#u6:0)=Rt +// if (!Pv) memb(Rs+#u6:0)=Rt +// if (Pv.new) memb(Rs+#u6:0)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_indexed_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), + "if ($src1.new) memb($src2+#$src3) = $src4", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memb(Rs+#u6:0)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_indexed_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), + "if (!$src1.new) memb($src2+#$src3) = $src4", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memb(Rs+Ru<<#u2)=Rt +// if (Pv) memb(Rs+Ru<<#u2)=Rt +let mayStore = 1, AddedComplexity = 10 in +def STrib_indexed_shl_cPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1) memb($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memb(Rs+Ru<<#u2)=Rt +let mayStore = 1, AddedComplexity = 10 in +def STrib_indexed_shl_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1.new) memb($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memb(Rs+Ru<<#u2)=Rt +let mayStore = 1, AddedComplexity = 10 in +def STrib_indexed_shl_cNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1) memb($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memb(Rs+Ru<<#u2)=Rt +let mayStore = 1, AddedComplexity = 10 in +def STrib_indexed_shl_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1.new) memb($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// 
if ([!]Pv[.new]) memb(Rx++#s4:0)=Rt +// if (Pv) memb(Rx++#s4:0)=Rt +// if (Pv.new) memb(Rx++#s4:0)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_STbri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), + "if ($src1.new) memb($src3++#$offset) = $src2", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (!Pv) memb(Rx++#s4:0)=Rt +// if (!Pv.new) memb(Rx++#s4:0)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_STbri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), + "if (!$src1.new) memb($src3++#$offset) = $src2", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + + +// Store halfword. +// memh(Re=#U6)=Rt.H +// TODO: needs to be implemented + +// memh(Re=#U6)=Rt +// TODO: needs to be implemented + +// memh(Rs+#s11:1)=Rt.H +// memh(Rs+#s11:1)=Rt +// memh(Rs+#u6:1)=#S8 +let AddedComplexity = 10, isPredicable = 1 in +def STrih_imm_V4 : STInst<(outs), + (ins IntRegs:$src1, u6_1Imm:$src2, s8Imm:$src3), + "memh($src1+#$src2) = #$src3", + [(truncstorei16 s8ImmPred:$src3, (add IntRegs:$src1, + u6_1ImmPred:$src2))]>, + Requires<[HasV4T]>; + +// memh(Rs+Ru<<#u2)=Rt.H +// TODO: needs to be implemented. + +// memh(Rs+Ru<<#u2)=Rt +let AddedComplexity = 10, isPredicable = 1 in +def STrih_indexed_shl_V4 : STInst<(outs), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), + "memh($src1+$src2<<#$src3) = $src4", + [(truncstorei16 IntRegs:$src4, (add IntRegs:$src1, + (shl IntRegs:$src2, + u2ImmPred:$src3)))]>, + Requires<[HasV4T]>; + +// memh(Ru<<#u2+#U6)=Rt.H +// memh(Ru<<#u2+#U6)=Rt +let AddedComplexity = 10 in +def STrih_shl_V4 : STInst<(outs), + (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + "memh($src1<<#$src2+#$src3) = $src4", + [(truncstorei16 IntRegs:$src4, (shl IntRegs:$src1, + (add u2ImmPred:$src2, + u6ImmPred:$src3)))]>, + Requires<[HasV4T]>; + +// memh(Rx++#s4:1:circ(Mu))=Rt.H +// memh(Rx++#s4:1:circ(Mu))=Rt +// memh(Rx++I:circ(Mu))=Rt.H +// memh(Rx++I:circ(Mu))=Rt +// memh(Rx++Mu)=Rt.H +// memh(Rx++Mu)=Rt +// memh(Rx++Mu:brev)=Rt.H +// memh(Rx++Mu:brev)=Rt +// memh(gp+#u16:1)=Rt.H +// memh(gp+#u16:1)=Rt + + +// Store halfword conditionally. +// if ([!]Pv[.new]) memh(#u6)=Rt.H +// if ([!]Pv[.new]) memh(#u6)=Rt + +// if ([!]Pv[.new]) memh(Rs+#u6:1)=#S6 +// if (Pv) memh(Rs+#u6:1)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_imm_cPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), + "if ($src1) memh($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memh(Rs+#u6:1)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_imm_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), + "if ($src1.new) memh($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memh(Rs+#u6:1)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_imm_cNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), + "if (!$src1) memh($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memh(Rs+#u6:1)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_imm_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), + "if (!$src1.new) memh($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memh(Rs+#u6:1)=Rt.H +// TODO: needs to be implemented. 
+ +// if ([!]Pv[.new]) memh(Rs+#u6:1)=Rt +// if (Pv) memh(Rs+#u6:1)=Rt +// if (Pv.new) memh(Rs+#u6:1)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1.new) memh($addr) = $src2", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memh(Rs+#u6:1)=Rt +// if (!Pv.new) memh(Rs+#u6:1)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1.new) memh($addr) = $src2", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memh(Rs+#u6:1)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_indexed_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), + "if ($src1.new) memh($src2+#$src3) = $src4", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memh(Rs+#u6:1)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_indexed_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), + "if (!$src1.new) memh($src2+#$src3) = $src4", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Rt.H +// if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Rt +// if (Pv) memh(Rs+Ru<<#u2)=Rt +let mayStore = 1, AddedComplexity = 10 in +def STrih_indexed_shl_cPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1) memh($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memh(Rs+Ru<<#u2)=Rt +def STrih_indexed_shl_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1.new) memh($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memh(Rs+Ru<<#u2)=Rt +let mayStore = 1, AddedComplexity = 10 in +def STrih_indexed_shl_cNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1) memh($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memh(Rs+Ru<<#u2)=Rt +let mayStore = 1, AddedComplexity = 10 in +def STrih_indexed_shl_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1.new) memh($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memh(Rx++#s4:1)=Rt.H +// TODO: Needs to be implemented. + +// if ([!]Pv[.new]) memh(Rx++#s4:1)=Rt +// if (Pv) memh(Rx++#s4:1)=Rt +// if (Pv.new) memh(Rx++#s4:1)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_SThri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), + "if ($src1.new) memh($src3++#$offset) = $src2", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (!Pv) memh(Rx++#s4:1)=Rt +// if (!Pv.new) memh(Rx++#s4:1)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_SThri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), + "if (!$src1.new) memh($src3++#$offset) = $src2", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + + +// Store word. +// memw(Re=#U6)=Rt +// TODO: Needs to be implemented. 
+ +// memw(Rs+#s11:2)=Rt +// memw(Rs+#u6:2)=#S8 +let AddedComplexity = 10, isPredicable = 1 in +def STriw_imm_V4 : STInst<(outs), + (ins IntRegs:$src1, u6_2Imm:$src2, s8Imm:$src3), + "memw($src1+#$src2) = #$src3", + [(store s8ImmPred:$src3, (add IntRegs:$src1, u6_2ImmPred:$src2))]>, + Requires<[HasV4T]>; + +// memw(Rs+Ru<<#u2)=Rt +let AddedComplexity = 10, isPredicable = 1 in +def STriw_indexed_shl_V4 : STInst<(outs), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), + "memw($src1+$src2<<#$src3) = $src4", + [(store IntRegs:$src4, (add IntRegs:$src1, + (shl IntRegs:$src2, u2ImmPred:$src3)))]>, + Requires<[HasV4T]>; + +// memw(Ru<<#u2+#U6)=Rt +let AddedComplexity = 10 in +def STriw_shl_V4 : STInst<(outs), + (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + "memw($src1<<#$src2+#$src3) = $src4", + [(store IntRegs:$src4, (shl IntRegs:$src1, + (add u2ImmPred:$src2, u6ImmPred:$src3)))]>, + Requires<[HasV4T]>; + +// memw(Rx++#s4:2)=Rt +// memw(Rx++#s4:2:circ(Mu))=Rt +// memw(Rx++I:circ(Mu))=Rt +// memw(Rx++Mu)=Rt +// memw(Rx++Mu:brev)=Rt +// memw(gp+#u16:2)=Rt + + +// Store word conditionally. +// if ([!]Pv[.new]) memw(#u6)=Rt +// TODO: Needs to be implemented. + +// if ([!]Pv[.new]) memw(Rs+#u6:2)=#S6 +// if (Pv) memw(Rs+#u6:2)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_imm_cPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), + "if ($src1) memw($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memw(Rs+#u6:2)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_imm_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), + "if ($src1.new) memw($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memw(Rs+#u6:2)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_imm_cNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), + "if (!$src1) memw($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memw(Rs+#u6:2)=#S6 +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_imm_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), + "if (!$src1.new) memw($src2+#$src3) = #$src4", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memw(Rs+#u6:2)=Rt +// if (Pv) memw(Rs+#u6:2)=Rt +// if (Pv.new) memw(Rs+#u6:2)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1.new) memw($addr) = $src2", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memw(Rs+#u6:2)=Rt +// if (!Pv.new) memw(Rs+#u6:2)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1.new) memw($addr) = $src2", + []>, + Requires<[HasV4T]>; + +// if (Pv) memw(Rs+#u6:2)=Rt +// if (!Pv) memw(Rs+#u6:2)=Rt +// if (Pv.new) memw(Rs+#u6:2)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_indexed_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), + "if ($src1.new) memw($src2+#$src3) = $src4", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memw(Rs+#u6:2)=Rt +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_indexed_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), + "if (!$src1.new) memw($src2+#$src3) = $src4", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memw(Rs+Ru<<#u2)=Rt +// if (Pv) 
memw(Rs+Ru<<#u2)=Rt +let mayStore = 1, AddedComplexity = 10 in +def STriw_indexed_shl_cPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1) memw($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memw(Rs+Ru<<#u2)=Rt +let mayStore = 1, AddedComplexity = 10 in +def STriw_indexed_shl_cdnPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1.new) memw($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memw(Rs+Ru<<#u2)=Rt +let mayStore = 1, AddedComplexity = 10 in +def STriw_indexed_shl_cNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1) memw($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memw(Rs+Ru<<#u2)=Rt +let mayStore = 1, AddedComplexity = 10 in +def STriw_indexed_shl_cdnNotPt_V4 : STInst<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1.new) memw($src2+$src3<<#$src4) = $src5", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memw(Rx++#s4:2)=Rt +// if (Pv) memw(Rx++#s4:2)=Rt +// if (Pv.new) memw(Rx++#s4:2)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_STwri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), + "if ($src1.new) memw($src3++#$offset) = $src2", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (!Pv) memw(Rx++#s4:2)=Rt +// if (!Pv.new) memw(Rx++#s4:2)=Rt +let mayStore = 1, hasCtrlDep = 1 in +def POST_STwri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), + "if (!$src1.new) memw($src3++#$offset) = $src2", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + + +//===----------------------------------------------------------------------=== +// ST - +//===----------------------------------------------------------------------=== + + +//===----------------------------------------------------------------------===// +// NV/ST + +//===----------------------------------------------------------------------===// + +// Store new-value byte. 
+ +// memb(Re=#U6)=Nt.new +// memb(Rs+#s11:0)=Nt.new +let mayStore = 1, isPredicable = 1 in +def STrib_nv_V4 : NVInst_V4<(outs), (ins MEMri:$addr, IntRegs:$src1), + "memb($addr) = $src1.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, isPredicable = 1 in +def STrib_indexed_nv_V4 : NVInst_V4<(outs), + (ins IntRegs:$src1, s11_0Imm:$src2, IntRegs:$src3), + "memb($src1+#$src2) = $src3.new", + []>, + Requires<[HasV4T]>; + +// memb(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10, isPredicable = 1 in +def STrib_indexed_shl_nv_V4 : NVInst_V4<(outs), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), + "memb($src1+$src2<<#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// memb(Ru<<#u2+#U6)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STrib_shl_nv_V4 : NVInst_V4<(outs), + (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + "memb($src1<<#$src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// memb(Rx++#s4:0)=Nt.new +let mayStore = 1, hasCtrlDep = 1, isPredicable = 1 in +def POST_STbri_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, s4_0Imm:$offset), + "memb($src2++#$offset) = $src1.new", + [], + "$src2 = $dst">, + Requires<[HasV4T]>; + +// memb(Rx++#s4:0:circ(Mu))=Nt.new +// memb(Rx++I:circ(Mu))=Nt.new +// memb(Rx++Mu)=Nt.new +// memb(Rx++Mu:brev)=Nt.new + +// memb(gp+#u16:0)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_GP_nv_V4 : NVInst_V4<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memb(#$global+$offset) = $src.new", + []>, + Requires<[HasV4T]>; + + +// Store new-value byte conditionally. +// if ([!]Pv[.new]) memb(#u6)=Nt.new +// if (Pv) memb(Rs+#u6:0)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1) memb($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memb(Rs+#u6:0)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1.new) memb($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memb(Rs+#u6:0)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1) memb($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memb(Rs+#u6:0)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1.new) memb($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (Pv) memb(Rs+#u6:0)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_indexed_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), + "if ($src1) memb($src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memb(Rs+#u6:0)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), + "if ($src1.new) memb($src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memb(Rs+#u6:0)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), + "if (!$src1) memb($src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) 
memb(Rs+#u6:0)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrib_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), + "if (!$src1.new) memb($src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + + +// if ([!]Pv[.new]) memb(Rs+Ru<<#u2)=Nt.new +// if (Pv) memb(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STrib_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1) memb($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memb(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STrib_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1.new) memb($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memb(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STrib_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1) memb($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memb(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STrib_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1.new) memb($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memb(Rx++#s4:0)=Nt.new +// if (Pv) memb(Rx++#s4:0)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_STbri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), + "if ($src1) memb($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (Pv.new) memb(Rx++#s4:0)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_STbri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), + "if ($src1.new) memb($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (!Pv) memb(Rx++#s4:0)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_STbri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), + "if (!$src1) memb($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (!Pv.new) memb(Rx++#s4:0)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_STbri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), + "if (!$src1.new) memb($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + + +// Store new-value halfword. 
+// memh(Re=#U6)=Nt.new +// memh(Rs+#s11:1)=Nt.new +let mayStore = 1, isPredicable = 1 in +def STrih_nv_V4 : NVInst_V4<(outs), (ins MEMri:$addr, IntRegs:$src1), + "memh($addr) = $src1.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, isPredicable = 1 in +def STrih_indexed_nv_V4 : NVInst_V4<(outs), + (ins IntRegs:$src1, s11_1Imm:$src2, IntRegs:$src3), + "memh($src1+#$src2) = $src3.new", + []>, + Requires<[HasV4T]>; + +// memh(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10, isPredicable = 1 in +def STrih_indexed_shl_nv_V4 : NVInst_V4<(outs), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), + "memh($src1+$src2<<#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// memh(Ru<<#u2+#U6)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STrih_shl_nv_V4 : NVInst_V4<(outs), + (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + "memh($src1<<#$src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// memh(Rx++#s4:1)=Nt.new +let mayStore = 1, hasCtrlDep = 1, isPredicable = 1 in +def POST_SThri_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, s4_1Imm:$offset), + "memh($src2++#$offset) = $src1.new", + [], + "$src2 = $dst">, + Requires<[HasV4T]>; + +// memh(Rx++#s4:1:circ(Mu))=Nt.new +// memh(Rx++I:circ(Mu))=Nt.new +// memh(Rx++Mu)=Nt.new +// memh(Rx++Mu:brev)=Nt.new + +// memh(gp+#u16:1)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_GP_nv_V4 : NVInst_V4<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memh(#$global+$offset) = $src.new", + []>, + Requires<[HasV4T]>; + + +// Store new-value halfword conditionally. + +// if ([!]Pv[.new]) memh(#u6)=Nt.new + +// if ([!]Pv[.new]) memh(Rs+#u6:1)=Nt.new +// if (Pv) memh(Rs+#u6:1)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1) memh($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memh(Rs+#u6:1)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1.new) memh($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memh(Rs+#u6:1)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1) memh($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memh(Rs+#u6:1)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1.new) memh($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (Pv) memh(Rs+#u6:1)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_indexed_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), + "if ($src1) memh($src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memh(Rs+#u6:1)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), + "if ($src1.new) memh($src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memh(Rs+#u6:1)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), + "if (!$src1) memh($src2+#$src3) = $src4.new", 
+ []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memh(Rs+#u6:1)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), + "if (!$src1.new) memh($src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Nt.new +// if (Pv) memh(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STrih_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1) memh($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memh(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STrih_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1.new) memh($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memh(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STrih_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1) memh($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memh(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STrih_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1.new) memh($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[]) memh(Rx++#s4:1)=Nt.new +// if (Pv) memh(Rx++#s4:1)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_SThri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), + "if ($src1) memh($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (Pv.new) memh(Rx++#s4:1)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_SThri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), + "if ($src1.new) memh($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (!Pv) memh(Rx++#s4:1)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_SThri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), + "if (!$src1) memh($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (!Pv.new) memh(Rx++#s4:1)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_SThri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), + "if (!$src1.new) memh($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + + +// Store new-value word. 
+ +// memw(Re=#U6)=Nt.new +// memw(Rs+#s11:2)=Nt.new +let mayStore = 1, isPredicable = 1 in +def STriw_nv_V4 : NVInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$src1), + "memw($addr) = $src1.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, isPredicable = 1 in +def STriw_indexed_nv_V4 : NVInst_V4<(outs), + (ins IntRegs:$src1, s11_2Imm:$src2, IntRegs:$src3), + "memw($src1+#$src2) = $src3.new", + []>, + Requires<[HasV4T]>; + +// memw(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10, isPredicable = 1 in +def STriw_indexed_shl_nv_V4 : NVInst_V4<(outs), + (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), + "memw($src1+$src2<<#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// memw(Ru<<#u2+#U6)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STriw_shl_nv_V4 : NVInst_V4<(outs), + (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + "memw($src1<<#$src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// memw(Rx++#s4:2)=Nt.new +let mayStore = 1, hasCtrlDep = 1, isPredicable = 1 in +def POST_STwri_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, s4_2Imm:$offset), + "memw($src2++#$offset) = $src1.new", + [], + "$src2 = $dst">, + Requires<[HasV4T]>; + +// memw(Rx++#s4:2:circ(Mu))=Nt.new +// memw(Rx++I:circ(Mu))=Nt.new +// memw(Rx++Mu)=Nt.new +// memw(Rx++Mu:brev)=Nt.new +// memw(gp+#u16:2)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_GP_nv_V4 : NVInst_V4<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memw(#$global+$offset) = $src.new", + []>, + Requires<[HasV4T]>; + + +// Store new-value word conditionally. + +// if ([!]Pv[.new]) memw(#u6)=Nt.new + +// if ([!]Pv[.new]) memw(Rs+#u6:2)=Nt.new +// if (Pv) memw(Rs+#u6:2)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1) memw($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memw(Rs+#u6:2)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if ($src1.new) memw($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memw(Rs+#u6:2)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1) memw($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memw(Rs+#u6:2)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), + "if (!$src1.new) memw($addr) = $src2.new", + []>, + Requires<[HasV4T]>; + +// if (Pv) memw(Rs+#u6:2)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_indexed_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), + "if ($src1) memw($src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memw(Rs+#u6:2)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), + "if ($src1.new) memw($src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memw(Rs+#u6:2)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), + "if (!$src1) memw($src2+#$src3) = $src4.new", + 
[]>, + Requires<[HasV4T]>; + +// if (!Pv.new) memw(Rs+#u6:2)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), + "if (!$src1.new) memw($src2+#$src3) = $src4.new", + []>, + Requires<[HasV4T]>; + + +// if ([!]Pv[.new]) memw(Rs+Ru<<#u2)=Nt.new +// if (Pv) memw(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STriw_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1) memw($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if (Pv.new) memw(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STriw_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if ($src1.new) memw($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv) memw(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STriw_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1) memw($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if (!Pv.new) memw(Rs+Ru<<#u2)=Nt.new +let mayStore = 1, AddedComplexity = 10 in +def STriw_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, + IntRegs:$src5), + "if (!$src1.new) memw($src2+$src3<<#$src4) = $src5.new", + []>, + Requires<[HasV4T]>; + +// if ([!]Pv[.new]) memw(Rx++#s4:2)=Nt.new +// if (Pv) memw(Rx++#s4:2)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_STwri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), + "if ($src1) memw($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (Pv.new) memw(Rx++#s4:2)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_STwri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), + "if ($src1.new) memw($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (!Pv) memw(Rx++#s4:2)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_STwri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), + "if (!$src1) memw($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + +// if (!Pv.new) memw(Rx++#s4:2)=Nt.new +let mayStore = 1, hasCtrlDep = 1 in +def POST_STwri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), + "if (!$src1.new) memw($src3++#$offset) = $src2.new", + [],"$src3 = $dst">, + Requires<[HasV4T]>; + + +//===----------------------------------------------------------------------===// +// NV/ST - +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// XTYPE/ALU + +//===----------------------------------------------------------------------===// + +// Add and accumulate. 
+// Rd=add(Rs,add(Ru,#s6)) +def ADDr_ADDri_V4 : MInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, s6Imm:$src3), + "$dst = add($src1, add($src2, #$src3))", + [(set IntRegs:$dst, + (add IntRegs:$src1, (add IntRegs:$src2, s6ImmPred:$src3)))]>, + Requires<[HasV4T]>; + +// Rd=add(Rs,sub(#s6,Ru)) +def ADDr_SUBri_V4 : MInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3), + "$dst = add($src1, sub(#$src2, $src3))", + [(set IntRegs:$dst, + (add IntRegs:$src1, (sub s6ImmPred:$src2, IntRegs:$src3)))]>, + Requires<[HasV4T]>; + +// Generates the same instruction as ADDr_SUBri_V4 but matches different +// pattern. +// Rd=add(Rs,sub(#s6,Ru)) +def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3), + "$dst = add($src1, sub(#$src2, $src3))", + [(set IntRegs:$dst, + (sub (add IntRegs:$src1, s6ImmPred:$src2), IntRegs:$src3))]>, + Requires<[HasV4T]>; + + +// Add or subtract doublewords with carry. +//TODO: +// Rdd=add(Rss,Rtt,Px):carry +//TODO: +// Rdd=sub(Rss,Rtt,Px):carry + + +// Logical doublewords. +// Rdd=and(Rtt,~Rss) +def ANDd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2), + "$dst = and($src1, ~$src2)", + [(set DoubleRegs:$dst, (and DoubleRegs:$src1, + (not DoubleRegs:$src2)))]>, + Requires<[HasV4T]>; + +// Rdd=or(Rtt,~Rss) +def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2), + "$dst = or($src1, ~$src2)", + [(set DoubleRegs:$dst, + (or DoubleRegs:$src1, (not DoubleRegs:$src2)))]>, + Requires<[HasV4T]>; + + +// Logical-logical doublewords. +// Rxx^=xor(Rss,Rtt) +def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + "$dst ^= xor($src2, $src3)", + [(set DoubleRegs:$dst, + (xor DoubleRegs:$src1, (xor DoubleRegs:$src2, DoubleRegs:$src3)))], + "$src1 = $dst">, + Requires<[HasV4T]>; + + +// Logical-logical words. 
+// Rx=or(Ru,and(Rx,#s10))
+def ORr_ANDri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3),
+            "$dst = or($src1, and($src2, #$src3))",
+            [(set IntRegs:$dst,
+                  (or IntRegs:$src1, (and IntRegs:$src2, s10ImmPred:$src3)))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=and(Rs,Rt)
+// Rx&=and(Rs,Rt)
+def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= and($src2, $src3)",
+            [(set IntRegs:$dst,
+                  (and IntRegs:$src1, (and IntRegs:$src2, IntRegs:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=and(Rs,Rt)
+def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= and($src2, $src3)",
+            [(set IntRegs:$dst,
+                  (or IntRegs:$src1, (and IntRegs:$src2, IntRegs:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx^=and(Rs,Rt)
+def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= and($src2, $src3)",
+            [(set IntRegs:$dst,
+                  (xor IntRegs:$src1, (and IntRegs:$src2, IntRegs:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=and(Rs,~Rt)
+// Rx&=and(Rs,~Rt)
+def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= and($src2, ~$src3)",
+            [(set IntRegs:$dst,
+                  (and IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=and(Rs,~Rt)
+def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= and($src2, ~$src3)",
+            [(set IntRegs:$dst,
+                  (or IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx^=and(Rs,~Rt)
+def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= and($src2, ~$src3)",
+            [(set IntRegs:$dst,
+                  (xor IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=or(Rs,Rt)
+// Rx&=or(Rs,Rt)
+def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= or($src2, $src3)",
+            [(set IntRegs:$dst,
+                  (and IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=or(Rs,Rt)
+def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= or($src2, $src3)",
+            [(set IntRegs:$dst,
+                  (or IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx^=or(Rs,Rt)
+def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= or($src2, $src3)",
+            [(set IntRegs:$dst,
+                  (xor IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=xor(Rs,Rt)
+// Rx&=xor(Rs,Rt)
+def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= xor($src2, $src3)",
+            [(set IntRegs:$dst,
+                  (and IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=xor(Rs,Rt)
+def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= xor($src2, $src3)",
+            [(set IntRegs:$dst,
+                  (or IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx^=xor(Rs,Rt)
+def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= xor($src2, $src3)",
+            [(set IntRegs:$dst,
+                  (xor IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=and(Rs,#s10)
+def ORr_ANDri2_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3),
+            "$dst |= and($src2, #$src3)",
+            [(set IntRegs:$dst,
+                  (or IntRegs:$src1, (and IntRegs:$src2, s10ImmPred:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=or(Rs,#s10)
+def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3),
+            "$dst |= or($src2, #$src3)",
+            [(set IntRegs:$dst,
+                  (or IntRegs:$src1, (or IntRegs:$src2, s10ImmPred:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+
+// Modulo wrap
+// Rd=modwrap(Rs,Rt)
+// Round
+// Rd=cround(Rs,#u5)
+// Rd=cround(Rs,Rt)
+// Rd=round(Rs,#u5)[:sat]
+// Rd=round(Rs,Rt)[:sat]
+// Vector reduce add unsigned halfwords
+// Rd=vraddh(Rss,Rtt)
+// Vector add bytes
+// Rdd=vaddb(Rss,Rtt)
+// Vector conditional negate
+// Rdd=vcnegh(Rss,Rt)
+// Rxx+=vrcnegh(Rss,Rt)
+// Vector maximum bytes
+// Rdd=vmaxb(Rtt,Rss)
+// Vector reduce maximum halfwords
+// Rxx=vrmaxh(Rss,Ru)
+// Rxx=vrmaxuh(Rss,Ru)
+// Vector reduce maximum words
+// Rxx=vrmaxuw(Rss,Ru)
+// Rxx=vrmaxw(Rss,Ru)
+// Vector minimum bytes
+// Rdd=vminb(Rtt,Rss)
+// Vector reduce minimum halfwords
+// Rxx=vrminh(Rss,Ru)
+// Rxx=vrminuh(Rss,Ru)
+// Vector reduce minimum words
+// Rxx=vrminuw(Rss,Ru)
+// Rxx=vrminw(Rss,Ru)
+// Vector subtract bytes
+// Rdd=vsubb(Rss,Rtt)
+
+//===----------------------------------------------------------------------===//
+// XTYPE/ALU -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY +
+//===----------------------------------------------------------------------===//
+
+// Multiply and use lower result.
+// Rd=add(#u6,mpyi(Rs,#U6)) +def ADDi_MPYri_V4 : MInst<(outs IntRegs:$dst), + (ins u6Imm:$src1, IntRegs:$src2, u6Imm:$src3), + "$dst = add(#$src1, mpyi($src2, #$src3))", + [(set IntRegs:$dst, + (add (mul IntRegs:$src2, u6ImmPred:$src3), u6ImmPred:$src1))]>, + Requires<[HasV4T]>; + +// Rd=add(#u6,mpyi(Rs,Rt)) + +def ADDi_MPYrr_V4 : MInst<(outs IntRegs:$dst), + (ins u6Imm:$src1, IntRegs:$src2, IntRegs:$src3), + "$dst = add(#$src1, mpyi($src2, $src3))", + [(set IntRegs:$dst, + (add (mul IntRegs:$src2, IntRegs:$src3), u6ImmPred:$src1))]>, + Requires<[HasV4T]>; + +// Rd=add(Ru,mpyi(#u6:2,Rs)) +def ADDr_MPYir_V4 : MInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, u6Imm:$src2, IntRegs:$src3), + "$dst = add($src1, mpyi(#$src2, $src3))", + [(set IntRegs:$dst, + (add IntRegs:$src1, (mul IntRegs:$src3, u6_2ImmPred:$src2)))]>, + Requires<[HasV4T]>; + +// Rd=add(Ru,mpyi(Rs,#u6)) +def ADDr_MPYri_V4 : MInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u6Imm:$src3), + "$dst = add($src1, mpyi($src2, #$src3))", + [(set IntRegs:$dst, + (add IntRegs:$src1, (mul IntRegs:$src2, u6ImmPred:$src3)))]>, + Requires<[HasV4T]>; + +// Rx=add(Ru,mpyi(Rx,Rs)) +def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "$dst = add($src1, mpyi($src2, $src3))", + [(set IntRegs:$dst, + (add IntRegs:$src1, (mul IntRegs:$src2, IntRegs:$src3)))], + "$src2 = $dst">, + Requires<[HasV4T]>; + + +// Polynomial multiply words +// Rdd=pmpyw(Rs,Rt) +// Rxx^=pmpyw(Rs,Rt) + +// Vector reduce multiply word by signed half (32x16) +// Rdd=vrmpyweh(Rss,Rtt)[:<<1] +// Rdd=vrmpywoh(Rss,Rtt)[:<<1] +// Rxx+=vrmpyweh(Rss,Rtt)[:<<1] +// Rxx+=vrmpywoh(Rss,Rtt)[:<<1] + +// Multiply and use upper result +// Rd=mpy(Rs,Rt.H):<<1:sat +// Rd=mpy(Rs,Rt.L):<<1:sat +// Rd=mpy(Rs,Rt):<<1 +// Rd=mpy(Rs,Rt):<<1:sat +// Rd=mpysu(Rs,Rt) +// Rx+=mpy(Rs,Rt):<<1:sat +// Rx-=mpy(Rs,Rt):<<1:sat + +// Vector multiply bytes +// Rdd=vmpybsu(Rs,Rt) +// Rdd=vmpybu(Rs,Rt) +// Rxx+=vmpybsu(Rs,Rt) +// Rxx+=vmpybu(Rs,Rt) + +// Vector polynomial multiply halfwords +// Rdd=vpmpyh(Rs,Rt) +// Rxx^=vpmpyh(Rs,Rt) + +//===----------------------------------------------------------------------===// +// XTYPE/MPY - +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// XTYPE/SHIFT + +//===----------------------------------------------------------------------===// + +// Shift by immediate and accumulate. 
+// Rx=add(#u8,asl(Rx,#U5)) +def ADDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), + (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + "$dst = add(#$src1, asl($src2, #$src3))", + [(set IntRegs:$dst, + (add (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + "$src2 = $dst">, + Requires<[HasV4T]>; + +// Rx=add(#u8,lsr(Rx,#U5)) +def ADDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), + (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + "$dst = add(#$src1, lsr($src2, #$src3))", + [(set IntRegs:$dst, + (add (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + "$src2 = $dst">, + Requires<[HasV4T]>; + +// Rx=sub(#u8,asl(Rx,#U5)) +def SUBi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), + (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + "$dst = sub(#$src1, asl($src2, #$src3))", + [(set IntRegs:$dst, + (sub (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + "$src2 = $dst">, + Requires<[HasV4T]>; + +// Rx=sub(#u8,lsr(Rx,#U5)) +def SUBi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), + (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + "$dst = sub(#$src1, lsr($src2, #$src3))", + [(set IntRegs:$dst, + (sub (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + "$src2 = $dst">, + Requires<[HasV4T]>; + + +//Shift by immediate and logical. +//Rx=and(#u8,asl(Rx,#U5)) +def ANDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), + (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + "$dst = and(#$src1, asl($src2, #$src3))", + [(set IntRegs:$dst, + (and (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + "$src2 = $dst">, + Requires<[HasV4T]>; + +//Rx=and(#u8,lsr(Rx,#U5)) +def ANDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), + (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + "$dst = and(#$src1, lsr($src2, #$src3))", + [(set IntRegs:$dst, + (and (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + "$src2 = $dst">, + Requires<[HasV4T]>; + +//Rx=or(#u8,asl(Rx,#U5)) +def ORi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), + (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + "$dst = or(#$src1, asl($src2, #$src3))", + [(set IntRegs:$dst, + (or (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + "$src2 = $dst">, + Requires<[HasV4T]>; + +//Rx=or(#u8,lsr(Rx,#U5)) +def ORi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), + (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + "$dst = or(#$src1, lsr($src2, #$src3))", + [(set IntRegs:$dst, + (or (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + "$src2 = $dst">, + Requires<[HasV4T]>; + + +//Shift by register. +//Rd=lsl(#s6,Rt) +def LSLi_V4 : MInst<(outs IntRegs:$dst), (ins s6Imm:$src1, IntRegs:$src2), + "$dst = lsl(#$src1, $src2)", + [(set IntRegs:$dst, (shl s6ImmPred:$src1, IntRegs:$src2))]>, + Requires<[HasV4T]>; + + +//Shift by register and logical. 
+//Rxx^=asl(Rss,Rt) +def ASLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + "$dst ^= asl($src2, $src3)", + [(set DoubleRegs:$dst, + (xor DoubleRegs:$src1, (shl DoubleRegs:$src2, IntRegs:$src3)))], + "$src1 = $dst">, + Requires<[HasV4T]>; + +//Rxx^=asr(Rss,Rt) +def ASRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + "$dst ^= asr($src2, $src3)", + [(set DoubleRegs:$dst, + (xor DoubleRegs:$src1, (sra DoubleRegs:$src2, IntRegs:$src3)))], + "$src1 = $dst">, + Requires<[HasV4T]>; + +//Rxx^=lsl(Rss,Rt) +def LSLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + "$dst ^= lsl($src2, $src3)", + [(set DoubleRegs:$dst, + (xor DoubleRegs:$src1, (shl DoubleRegs:$src2, IntRegs:$src3)))], + "$src1 = $dst">, + Requires<[HasV4T]>; + +//Rxx^=lsr(Rss,Rt) +def LSRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + "$dst ^= lsr($src2, $src3)", + [(set DoubleRegs:$dst, + (xor DoubleRegs:$src1, (srl DoubleRegs:$src2, IntRegs:$src3)))], + "$src1 = $dst">, + Requires<[HasV4T]>; + + +//===----------------------------------------------------------------------===// +// XTYPE/SHIFT - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MEMOP: Word, Half, Byte +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MEMOP: Word +// +// Implemented: +// MEMw_ADDi_indexed_V4 : memw(Rs+#u6:2)+=#U5 +// MEMw_SUBi_indexed_V4 : memw(Rs+#u6:2)-=#U5 +// MEMw_ADDr_indexed_V4 : memw(Rs+#u6:2)+=Rt +// MEMw_SUBr_indexed_V4 : memw(Rs+#u6:2)-=Rt +// MEMw_CLRr_indexed_V4 : memw(Rs+#u6:2)&=Rt +// MEMw_SETr_indexed_V4 : memw(Rs+#u6:2)|=Rt +// MEMw_ADDi_V4 : memw(Rs+#u6:2)+=#U5 +// MEMw_SUBi_V4 : memw(Rs+#u6:2)-=#U5 +// MEMw_ADDr_V4 : memw(Rs+#u6:2)+=Rt +// MEMw_SUBr_V4 : memw(Rs+#u6:2)-=Rt +// MEMw_CLRr_V4 : memw(Rs+#u6:2)&=Rt +// MEMw_SETr_V4 : memw(Rs+#u6:2)|=Rt +// +// Not implemented: +// MEMw_CLRi_indexed_V4 : memw(Rs+#u6:2)=clrbit(#U5) +// MEMw_SETi_indexed_V4 : memw(Rs+#u6:2)=setbit(#U5) +// MEMw_CLRi_V4 : memw(Rs+#u6:2)=clrbit(#U5) +// MEMw_SETi_V4 : memw(Rs+#u6:2)=setbit(#U5) +//===----------------------------------------------------------------------===// + + +// MEMw_ADDSUBi_indexed_V4: +// pseudo operation for MEMw_ADDi_indexed_V4 and +// MEMw_SUBi_indexed_V4 a later pass will change it +// to the corresponding pattern. 
+let AddedComplexity = 30 in
+def MEMw_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_2Imm:$offset, m6Imm:$addend),
+            "Error; should not emit",
+            [(store (add (load (add IntRegs:$base, u6_2ImmPred:$offset)),
+                         m6ImmPred:$addend),
+                    (add IntRegs:$base, u6_2ImmPred:$offset))]>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memw(Rs+#u6:2) += #U5
+let AddedComplexity = 30 in
+def MEMw_ADDi_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_2Imm:$offset, u5Imm:$addend),
+            "memw($base+#$offset) += $addend",
+            []>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memw(Rs+#u6:2) -= #U5
+let AddedComplexity = 30 in
+def MEMw_SUBi_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_2Imm:$offset, u5Imm:$subend),
+            "memw($base+#$offset) -= $subend",
+            []>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memw(Rs+#u6:2) += Rt
+let AddedComplexity = 30 in
+def MEMw_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$addend),
+            "memw($base+#$offset) += $addend",
+            [(store (add (load (add IntRegs:$base, u6_2ImmPred:$offset)),
+                         IntRegs:$addend),
+                    (add IntRegs:$base, u6_2ImmPred:$offset))]>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memw(Rs+#u6:2) -= Rt
+let AddedComplexity = 30 in
+def MEMw_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$subend),
+            "memw($base+#$offset) -= $subend",
+            [(store (sub (load (add IntRegs:$base, u6_2ImmPred:$offset)),
+                         IntRegs:$subend),
+                    (add IntRegs:$base, u6_2ImmPred:$offset))]>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memw(Rs+#u6:2) &= Rt
+let AddedComplexity = 30 in
+def MEMw_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$andend),
+            "memw($base+#$offset) &= $andend",
+            [(store (and (load (add IntRegs:$base, u6_2ImmPred:$offset)),
+                         IntRegs:$andend),
+                    (add IntRegs:$base, u6_2ImmPred:$offset))]>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memw(Rs+#u6:2) |= Rt
+let AddedComplexity = 30 in
+def MEMw_ORr_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$orend),
+            "memw($base+#$offset) |= $orend",
+            [(store (or (load (add IntRegs:$base, u6_2ImmPred:$offset)),
+                        IntRegs:$orend),
+                    (add IntRegs:$base, u6_2ImmPred:$offset))]>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// MEMw_ADDSUBi_V4:
+// Pseudo operation for MEMw_ADDi_V4 and MEMw_SUBi_V4;
+// a later pass will change it to the right pattern.
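Editorial aside, not part of the patch: the word memops above fold a load, an ALU operation, and a store of the same address into one instruction. A minimal C++ sketch of the source-level updates the patterns are meant to catch (the function and its names are invented for illustration):

    void update_word(int *p, int r) {
      p[3] += 5;   // memw(Rs+#12) += #U5
      p[3] -= 5;   // memw(Rs+#12) -= #U5
      p[3] += r;   // memw(Rs+#12) += Rt
      p[3] &= r;   // memw(Rs+#12) &= Rt
    }

Since m6Imm appears to be a signed immediate, the ADDSUBi pseudo can match both the "+= #imm" and "-= #imm" shapes with one pattern; the later pass mentioned in the comments is then expected to select the real "+= #U5" or "-= #U5" form from the sign of the immediate.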
+let AddedComplexity = 30 in +def MEMw_ADDSUBi_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, m6Imm:$addend), + "Error; should not emit", + [(store (add (load ADDRriU6_2:$addr), m6ImmPred:$addend), + ADDRriU6_2:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memw(Rs+#u6:2) += #U5 +let AddedComplexity = 30 in +def MEMw_ADDi_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, u5Imm:$addend), + "memw($addr) += $addend", + []>, + Requires<[HasV4T, UseMEMOP]>; + +// memw(Rs+#u6:2) -= #U5 +let AddedComplexity = 30 in +def MEMw_SUBi_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, u5Imm:$subend), + "memw($addr) -= $subend", + []>, + Requires<[HasV4T, UseMEMOP]>; + +// memw(Rs+#u6:2) += Rt +let AddedComplexity = 30 in +def MEMw_ADDr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$addend), + "memw($addr) += $addend", + [(store (add (load ADDRriU6_2:$addr), IntRegs:$addend), + ADDRriU6_2:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memw(Rs+#u6:2) -= Rt +let AddedComplexity = 30 in +def MEMw_SUBr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$subend), + "memw($addr) -= $subend", + [(store (sub (load ADDRriU6_2:$addr), IntRegs:$subend), + ADDRriU6_2:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memw(Rs+#u6:2) &= Rt +let AddedComplexity = 30 in +def MEMw_ANDr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$andend), + "memw($addr) &= $andend", + [(store (and (load ADDRriU6_2:$addr), IntRegs:$andend), + ADDRriU6_2:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memw(Rs+#u6:2) |= Rt +let AddedComplexity = 30 in +def MEMw_ORr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$orend), + "memw($addr) |= $orend", + [(store (or (load ADDRriU6_2:$addr), IntRegs:$orend), +ADDRriU6_2:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +//===----------------------------------------------------------------------===// +// MEMOP: Halfword +// +// Implemented: +// MEMh_ADDi_indexed_V4 : memw(Rs+#u6:2)+=#U5 +// MEMh_SUBi_indexed_V4 : memw(Rs+#u6:2)-=#U5 +// MEMh_ADDr_indexed_V4 : memw(Rs+#u6:2)+=Rt +// MEMh_SUBr_indexed_V4 : memw(Rs+#u6:2)-=Rt +// MEMh_CLRr_indexed_V4 : memw(Rs+#u6:2)&=Rt +// MEMh_SETr_indexed_V4 : memw(Rs+#u6:2)|=Rt +// MEMh_ADDi_V4 : memw(Rs+#u6:2)+=#U5 +// MEMh_SUBi_V4 : memw(Rs+#u6:2)-=#U5 +// MEMh_ADDr_V4 : memw(Rs+#u6:2)+=Rt +// MEMh_SUBr_V4 : memw(Rs+#u6:2)-=Rt +// MEMh_CLRr_V4 : memw(Rs+#u6:2)&=Rt +// MEMh_SETr_V4 : memw(Rs+#u6:2)|=Rt +// +// Not implemented: +// MEMh_CLRi_indexed_V4 : memw(Rs+#u6:2)=clrbit(#U5) +// MEMh_SETi_indexed_V4 : memw(Rs+#u6:2)=setbit(#U5) +// MEMh_CLRi_V4 : memw(Rs+#u6:2)=clrbit(#U5) +// MEMh_SETi_V4 : memw(Rs+#u6:2)=setbit(#U5) +//===----------------------------------------------------------------------===// + + +// MEMh_ADDSUBi_indexed_V4: +// Pseudo operation for MEMh_ADDi_indexed_V4 and +// MEMh_SUBi_indexed_V4 a later pass will change it +// to the corresponding pattern. 
+let AddedComplexity = 30 in
+def MEMh_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_1Imm:$offset, m6Imm:$addend),
+            "Error; should not emit",
+            [(truncstorei16 (add (sextloadi16 (add IntRegs:$base,
+                                                   u6_1ImmPred:$offset)),
+                                 m6ImmPred:$addend),
+                            (add IntRegs:$base, u6_1ImmPred:$offset))]>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memh(Rs+#u6:1) += #U5
+let AddedComplexity = 30 in
+def MEMh_ADDi_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_1Imm:$offset, u5Imm:$addend),
+            "memh($base+#$offset) += $addend",
+            []>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memh(Rs+#u6:1) -= #U5
+let AddedComplexity = 30 in
+def MEMh_SUBi_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_1Imm:$offset, u5Imm:$subend),
+            "memh($base+#$offset) -= $subend",
+            []>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memh(Rs+#u6:1) += Rt
+let AddedComplexity = 30 in
+def MEMh_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$addend),
+            "memh($base+#$offset) += $addend",
+            [(truncstorei16 (add (sextloadi16 (add IntRegs:$base,
+                                                   u6_1ImmPred:$offset)),
+                                 IntRegs:$addend),
+                            (add IntRegs:$base, u6_1ImmPred:$offset))]>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memh(Rs+#u6:1) -= Rt
+let AddedComplexity = 30 in
+def MEMh_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$subend),
+            "memh($base+#$offset) -= $subend",
+            [(truncstorei16 (sub (sextloadi16 (add IntRegs:$base,
+                                                   u6_1ImmPred:$offset)),
+                                 IntRegs:$subend),
+                            (add IntRegs:$base, u6_1ImmPred:$offset))]>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memh(Rs+#u6:1) &= Rt
+let AddedComplexity = 30 in
+def MEMh_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$andend),
+            "memh($base+#$offset) &= $andend",
+            [(truncstorei16 (and (sextloadi16 (add IntRegs:$base,
+                                                   u6_1ImmPred:$offset)),
+                                 IntRegs:$andend),
+                            (add IntRegs:$base, u6_1ImmPred:$offset))]>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// memh(Rs+#u6:1) |= Rt
+let AddedComplexity = 30 in
+def MEMh_ORr_indexed_MEM_V4 : MEMInst_V4<(outs),
+            (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$orend),
+            "memh($base+#$offset) |= $orend",
+            [(truncstorei16 (or (sextloadi16 (add IntRegs:$base,
+                                                  u6_1ImmPred:$offset)),
+                                IntRegs:$orend),
+                            (add IntRegs:$base, u6_1ImmPred:$offset))]>,
+            Requires<[HasV4T, UseMEMOP]>;
+
+// MEMh_ADDSUBi_V4:
+// Pseudo operation for MEMh_ADDi_V4 and MEMh_SUBi_V4;
+// a later pass will change it to the right pattern.
+let AddedComplexity = 30 in +def MEMh_ADDSUBi_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, m6Imm:$addend), + "Error; should not emit", + [(truncstorei16 (add (sextloadi16 ADDRriU6_1:$addr), + m6ImmPred:$addend), ADDRriU6_1:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memh(Rs+#u6:1) += #U5 +let AddedComplexity = 30 in +def MEMh_ADDi_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, u5Imm:$addend), + "memh($addr) += $addend", + []>, + Requires<[HasV4T, UseMEMOP]>; + +// memh(Rs+#u6:1) -= #U5 +let AddedComplexity = 30 in +def MEMh_SUBi_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, u5Imm:$subend), + "memh($addr) -= $subend", + []>, + Requires<[HasV4T, UseMEMOP]>; + +// memh(Rs+#u6:1) += Rt +let AddedComplexity = 30 in +def MEMh_ADDr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$addend), + "memh($addr) += $addend", + [(truncstorei16 (add (sextloadi16 ADDRriU6_1:$addr), + IntRegs:$addend), ADDRriU6_1:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memh(Rs+#u6:1) -= Rt +let AddedComplexity = 30 in +def MEMh_SUBr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$subend), + "memh($addr) -= $subend", + [(truncstorei16 (sub (sextloadi16 ADDRriU6_1:$addr), + IntRegs:$subend), ADDRriU6_1:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memh(Rs+#u6:1) &= Rt +let AddedComplexity = 30 in +def MEMh_ANDr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$andend), + "memh($addr) &= $andend", + [(truncstorei16 (and (sextloadi16 ADDRriU6_1:$addr), + IntRegs:$andend), ADDRriU6_1:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memh(Rs+#u6:1) |= Rt +let AddedComplexity = 30 in +def MEMh_ORr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$orend), + "memh($addr) |= $orend", + [(truncstorei16 (or (sextloadi16 ADDRriU6_1:$addr), + IntRegs:$orend), ADDRriU6_1:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + + +//===----------------------------------------------------------------------===// +// MEMOP: Byte +// +// Implemented: +// MEMb_ADDi_indexed_V4 : memb(Rs+#u6:0)+=#U5 +// MEMb_SUBi_indexed_V4 : memb(Rs+#u6:0)-=#U5 +// MEMb_ADDr_indexed_V4 : memb(Rs+#u6:0)+=Rt +// MEMb_SUBr_indexed_V4 : memb(Rs+#u6:0)-=Rt +// MEMb_CLRr_indexed_V4 : memb(Rs+#u6:0)&=Rt +// MEMb_SETr_indexed_V4 : memb(Rs+#u6:0)|=Rt +// MEMb_ADDi_V4 : memb(Rs+#u6:0)+=#U5 +// MEMb_SUBi_V4 : memb(Rs+#u6:0)-=#U5 +// MEMb_ADDr_V4 : memb(Rs+#u6:0)+=Rt +// MEMb_SUBr_V4 : memb(Rs+#u6:0)-=Rt +// MEMb_CLRr_V4 : memb(Rs+#u6:0)&=Rt +// MEMb_SETr_V4 : memb(Rs+#u6:0)|=Rt +// +// Not implemented: +// MEMb_CLRi_indexed_V4 : memb(Rs+#u6:0)=clrbit(#U5) +// MEMb_SETi_indexed_V4 : memb(Rs+#u6:0)=setbit(#U5) +// MEMb_CLRi_V4 : memb(Rs+#u6:0)=clrbit(#U5) +// MEMb_SETi_V4 : memb(Rs+#u6:0)=setbit(#U5) +//===----------------------------------------------------------------------===// + + +// MEMb_ADDSUBi_indexed_V4: +// Pseudo operation for MEMb_ADDi_indexed_V4 and +// MEMb_SUBi_indexed_V4 a later pass will change it +// to the corresponding pattern. 
+let AddedComplexity = 30 in +def MEMb_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs), + (ins IntRegs:$base, u6_0Imm:$offset, m6Imm:$addend), + "Error; should not emit", + [(truncstorei8 (add (sextloadi8 (add IntRegs:$base, + u6_0ImmPred:$offset)), + m6ImmPred:$addend), + (add IntRegs:$base, u6_0ImmPred:$offset))]>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) += #U5 +let AddedComplexity = 30 in +def MEMb_ADDi_indexed_MEM_V4 : MEMInst_V4<(outs), + (ins IntRegs:$base, u6_0Imm:$offset, u5Imm:$addend), + "memb($base+#$offset) += $addend", + []>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) -= #U5 +let AddedComplexity = 30 in +def MEMb_SUBi_indexed_MEM_V4 : MEMInst_V4<(outs), + (ins IntRegs:$base, u6_0Imm:$offset, u5Imm:$subend), + "memb($base+#$offset) -= $subend", + []>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) += Rt +let AddedComplexity = 30 in +def MEMb_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs), + (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$addend), + "memb($base+#$offset) += $addend", + [(truncstorei8 (add (sextloadi8 (add IntRegs:$base, + u6_0ImmPred:$offset)), + IntRegs:$addend), + (add IntRegs:$base, u6_0ImmPred:$offset))]>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) -= Rt +let AddedComplexity = 30 in +def MEMb_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs), + (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$subend), + "memb($base+#$offset) -= $subend", + [(truncstorei8 (sub (sextloadi8 (add IntRegs:$base, + u6_0ImmPred:$offset)), + IntRegs:$subend), + (add IntRegs:$base, u6_0ImmPred:$offset))]>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) &= Rt +let AddedComplexity = 30 in +def MEMb_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs), + (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$andend), + "memb($base+#$offset) &= $andend", + [(truncstorei8 (and (sextloadi8 (add IntRegs:$base, + u6_0ImmPred:$offset)), + IntRegs:$andend), + (add IntRegs:$base, u6_0ImmPred:$offset))]>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) |= Rt +let AddedComplexity = 30 in +def MEMb_ORr_indexed_MEM_V4 : MEMInst_V4<(outs), + (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$orend), + "memb($base+#$offset) |= $orend", + [(truncstorei8 (or (sextloadi8 (add IntRegs:$base, + u6_0ImmPred:$offset)), + IntRegs:$orend), + (add IntRegs:$base, u6_0ImmPred:$offset))]>, + Requires<[HasV4T, UseMEMOP]>; + +// MEMb_ADDSUBi_V4: +// Pseudo operation for MEMb_ADDi_V4 and MEMb_SUBi_V4; +// a later pass will change it to the right pattern.
+let AddedComplexity = 30 in +def MEMb_ADDSUBi_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, m6Imm:$addend), + "Error; should not emit", + [(truncstorei8 (add (sextloadi8 ADDRriU6_0:$addr), + m6ImmPred:$addend), ADDRriU6_0:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) += #U5 +let AddedComplexity = 30 in +def MEMb_ADDi_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, u5Imm:$addend), + "memb($addr) += $addend", + []>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) -= #U5 +let AddedComplexity = 30 in +def MEMb_SUBi_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, u5Imm:$subend), + "memb($addr) -= $subend", + []>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) += Rt +let AddedComplexity = 30 in +def MEMb_ADDr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$addend), + "memb($addr) += $addend", + [(truncstorei8 (add (sextloadi8 ADDRriU6_0:$addr), + IntRegs:$addend), ADDRriU6_0:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) -= Rt +let AddedComplexity = 30 in +def MEMb_SUBr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$subend), + "memb($addr) -= $subend", + [(truncstorei8 (sub (sextloadi8 ADDRriU6_0:$addr), + IntRegs:$subend), ADDRriU6_0:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) &= Rt +let AddedComplexity = 30 in +def MEMb_ANDr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$andend), + "memb($addr) &= $andend", + [(truncstorei8 (and (sextloadi8 ADDRriU6_0:$addr), + IntRegs:$andend), ADDRriU6_0:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + +// memb(Rs+#u6:0) |= Rt +let AddedComplexity = 30 in +def MEMb_ORr_MEM_V4 : MEMInst_V4<(outs), + (ins MEMri:$addr, IntRegs:$orend), + "memb($addr) |= $orend", + [(truncstorei8 (or (sextloadi8 ADDRriU6_0:$addr), + IntRegs:$orend), ADDRriU6_0:$addr)]>, + Requires<[HasV4T, UseMEMOP]>; + + +//===----------------------------------------------------------------------===// +// MEMOP: Byte + +// +// Implemented: +// MEMb_ADDi_indexed_V4 : memb(Rs+#u6:0)+=#U5 +// MEMb_SUBi_indexed_V4 : memb(Rs+#u6:0)-=#U5 +// MEMb_ADDr_indexed_V4 : memb(Rs+#u6:0)+=Rt +// MEMb_SUBr_indexed_V4 : memb(Rs+#u6:0)-=Rt +// MEMb_CLRr_indexed_V4 : memb(Rs+#u6:0)&=Rt +// MEMb_SETr_indexed_V4 : memb(Rs+#u6:0)|=Rt +// MEMb_ADDi_V4 : memb(Rs+#u6:0)+=#U5 +// MEMb_SUBi_V4 : memb(Rs+#u6:0)-=#U5 +// MEMb_ADDr_V4 : memb(Rs+#u6:0)+=Rt +// MEMb_SUBr_V4 : memb(Rs+#u6:0)-=Rt +// MEMb_CLRr_V4 : memb(Rs+#u6:0)&=Rt +// MEMb_SETr_V4 : memb(Rs+#u6:0)|=Rt +// +// Not implemented: +// MEMb_CLRi_indexed_V4 : memb(Rs+#u6:0)=clrbit(#U5) +// MEMb_SETi_indexed_V4 : memb(Rs+#u6:0)=setbit(#U5) +// MEMb_CLRi_V4 : memb(Rs+#u6:0)=clrbit(#U5) +// MEMb_SETi_V4 : memb(Rs+#u6:0)=setbit(#U5) +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// XTYPE/PRED + +//===----------------------------------------------------------------------===// + +// Hexagon V4 only supports these flavors of byte/half compare instructions: +// EQ/GT/GTU. Other flavors such as GE/GEU/LT/LTU/LE/LEU are not supported by +// the hardware. However, the compiler can still synthesize them by combining +// the patterns that are implemented. +// The implemented patterns are: EQ/GT/GTU. +// Missing patterns are: GE/GEU/LT/LTU/LE/LEU.
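Editorial note, not part of the original patch: one way to see how the missing flavors fall out of the implemented ones is that a signed byte "less than" is just the GT compare with its operands swapped (a < b exactly when b > a), while GE/LE/NE additionally need the resulting predicate inverted. A minimal TableGen sketch under that assumption, reusing the CMPbGTrr_V4 definition below (the i1 result type and the placement of the Pat are illustrative, not taken from this patch):

// Illustration only: select a signed byte "less than" by swapping the
// operands of the cmpb.gt pattern.
def : Pat<(i1 (setlt (shl IntRegs:$src1, (i32 24)),
                     (shl IntRegs:$src2, (i32 24)))),
          (CMPbGTrr_V4 IntRegs:$src2, IntRegs:$src1)>;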
+ +// Pd=cmpb.eq(Rs,#u8) +let isCompare = 1 in +def CMPbEQri_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, u8Imm:$src2), + "$dst = cmpb.eq($src1, #$src2)", + [(set PredRegs:$dst, (seteq (and IntRegs:$src1, 255), + u8ImmPred:$src2))]>, + Requires<[HasV4T]>; + +// Pd=cmpb.eq(Rs,Rt) +let isCompare = 1 in +def CMPbEQrr_ubub_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = cmpb.eq($src1, $src2)", + [(set PredRegs:$dst, (seteq (and (xor IntRegs:$src1, + IntRegs:$src2), + 255), + 0))]>, + Requires<[HasV4T]>; + +// Pd=cmpb.eq(Rs,Rt) +let isCompare = 1 in +def CMPbEQrr_sbsb_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = cmpb.eq($src1, $src2)", + [(set PredRegs:$dst, (seteq (shl IntRegs:$src1, (i32 24)), + (shl IntRegs:$src2, (i32 24))))]>, + Requires<[HasV4T]>; + +// Pd=cmpb.gt(Rs,#s8) +let isCompare = 1 in +def CMPbGTri_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, s32Imm:$src2), + "$dst = cmpb.gt($src1, #$src2)", + [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 24)), + s32_24ImmPred:$src2))]>, + Requires<[HasV4T]>; + +// Pd=cmpb.gt(Rs,Rt) +let isCompare = 1 in +def CMPbGTrr_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = cmpb.gt($src1, $src2)", + [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 24)), + (shl IntRegs:$src2, (i32 24))))]>, + Requires<[HasV4T]>; + +// Pd=cmpb.gtu(Rs,#u7) +let isCompare = 1 in +def CMPbGTUri_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, u7Imm:$src2), + "$dst = cmpb.gtu($src1, #$src2)", + [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 255), + u7ImmPred:$src2))]>, + Requires<[HasV4T]>; + +// Pd=cmpb.gtu(Rs,Rt) +let isCompare = 1 in +def CMPbGTUrr_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = cmpb.gtu($src1, $src2)", + [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 255), + (and IntRegs:$src2, 255)))]>, + Requires<[HasV4T]>; + +// Signed half compare(.eq) ri. +// Pd=cmph.eq(Rs,#s8) +let isCompare = 1 in +def CMPhEQri_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, u16Imm:$src2), + "$dst = cmph.eq($src1, #$src2)", + [(set PredRegs:$dst, (seteq (and IntRegs:$src1, 65535), + u16_s8ImmPred:$src2))]>, + Requires<[HasV4T]>; + +// Signed half compare(.eq) rr. +// Case 1: xor + and, then compare: +// r0=xor(r0,r1) +// r0=and(r0,#0xffff) +// p0=cmp.eq(r0,#0) +// Pd=cmph.eq(Rs,Rt) +let isCompare = 1 in +def CMPhEQrr_xor_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = cmph.eq($src1, $src2)", + [(set PredRegs:$dst, (seteq (and (xor IntRegs:$src1, + IntRegs:$src2), + 65535), + 0))]>, + Requires<[HasV4T]>; + +// Signed half compare(.eq) rr. +// Case 2: shift left 16 bits then compare: +// r0=asl(r0,16) +// r1=asl(r1,16) +// p0=cmp.eq(r0,r1) +// Pd=cmph.eq(Rs,Rt) +let isCompare = 1 in +def CMPhEQrr_shl_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = cmph.eq($src1, $src2)", + [(set PredRegs:$dst, (seteq (shl IntRegs:$src1, (i32 16)), + (shl IntRegs:$src2, (i32 16))))]>, + Requires<[HasV4T]>; + +// Signed half compare(.gt) ri. +// Pd=cmph.gt(Rs,#s8) +let isCompare = 1 in +def CMPhGTri_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, s32Imm:$src2), + "$dst = cmph.gt($src1, #$src2)", + [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 16)), + s32_16s8ImmPred:$src2))]>, + Requires<[HasV4T]>; + +// Signed half compare(.gt) rr. 
+// Pd=cmph.gt(Rs,Rt) +let isCompare = 1 in +def CMPhGTrr_shl_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = cmph.gt($src1, $src2)", + [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 16)), + (shl IntRegs:$src2, (i32 16))))]>, + Requires<[HasV4T]>; + +// Unsigned half compare rr (.gtu). +// Pd=cmph.gtu(Rs,Rt) +let isCompare = 1 in +def CMPhGTUrr_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = cmph.gtu($src1, $src2)", + [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 65535), + (and IntRegs:$src2, 65535)))]>, + Requires<[HasV4T]>; + +// Unsigned half compare ri (.gtu). +// Pd=cmph.gtu(Rs,#u7) +let isCompare = 1 in +def CMPhGTUri_V4 : MInst<(outs PredRegs:$dst), + (ins IntRegs:$src1, u7Imm:$src2), + "$dst = cmph.gtu($src1, #$src2)", + [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 65535), + u7ImmPred:$src2))]>, + Requires<[HasV4T]>; + +//===----------------------------------------------------------------------===// +// XTYPE/PRED - +//===----------------------------------------------------------------------===// + +// Deallocate frame and return. +// dealloc_return +let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicable = 1, + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + def DEALLOC_RET_V4 : NVInst_V4<(outs), (ins i32imm:$amt1), + "dealloc_return", + []>, + Requires<[HasV4T]>; +} + +// if (Ps) dealloc_return +let isReturn = 1, isTerminator = 1, + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + def DEALLOC_RET_cPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), + "if ($src1) dealloc_return", + []>, + Requires<[HasV4T]>; +} + +// if (!Ps) dealloc_return +let isReturn = 1, isTerminator = 1, + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + def DEALLOC_RET_cNotPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, + i32imm:$amt1), + "if (!$src1) dealloc_return", + []>, + Requires<[HasV4T]>; +} + +// if (Ps.new) dealloc_return:nt +let isReturn = 1, isTerminator = 1, + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + def DEALLOC_RET_cdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, + i32imm:$amt1), + "if ($src1.new) dealloc_return:nt", + []>, + Requires<[HasV4T]>; +} + +// if (!Ps.new) dealloc_return:nt +let isReturn = 1, isTerminator = 1, + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + def DEALLOC_RET_cNotdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, + i32imm:$amt1), + "if (!$src1.new) dealloc_return:nt", + []>, + Requires<[HasV4T]>; +} + +// if (Ps.new) dealloc_return:t +let isReturn = 1, isTerminator = 1, + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + def DEALLOC_RET_cdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, + i32imm:$amt1), + "if ($src1.new) dealloc_return:t", + []>, + Requires<[HasV4T]>; +} + +// if (!Ps.new) dealloc_return:t +let isReturn = 1, isTerminator = 1, + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + def DEALLOC_RET_cNotdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, + i32imm:$amt1), + "if (!$src1.new) dealloc_return:t", + []>, + Requires<[HasV4T]>; +} diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td new file mode 100644 index 0000000..1328eba --- /dev/null +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -0,0 +1,3462 @@ +//===- HexagonIntrinsics.td - Instruction intrinsics -------*- tablegen -*-===// +// +// The LLVM Compiler
Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This is populated based on the following specs: +// Hexagon V2 Architecture +// Application-Level Specification +// 80-V9418-8 Rev. B +// March 4, 2008 +//===----------------------------------------------------------------------===// + +// +// ALU 32 types. +// + +class qi_ALU32_sisi<string opc, Intrinsic IntID> + : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class qi_ALU32_sis10<string opc, Intrinsic IntID> + : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class qi_ALU32_sis8<string opc, Intrinsic IntID> + : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class qi_ALU32_siu8<string opc, Intrinsic IntID> + : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class qi_ALU32_siu9<string opc, Intrinsic IntID> + : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u9Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class si_ALU32_qisisi<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2, + IntRegs:$src3))]>; + +class si_ALU32_qis8si<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2, + IntRegs:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, $src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2, + IntRegs:$src3))]>; + +class si_ALU32_qisis8<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + s8Imm:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2, + imm:$src3))]>; + +class si_ALU32_qis8s8<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2, s8Imm:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2, imm:$src3))]>; + +class si_ALU32_sisi<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU32_sisi_sat<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU32_sisi_rnd<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")), + [(set IntRegs:$dst, 
(IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU32_sis16<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s16Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class si_ALU32_sis10<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class si_ALU32_s10si<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins s10Imm:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "(#$src1, $src2)")), + [(set IntRegs:$dst, (IntID imm:$src1, IntRegs:$src2))]>; + +class si_lo_ALU32_siu16<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u16Imm:$src2), + !strconcat("$dst.l = ", !strconcat(opc , "#$src2")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class si_hi_ALU32_siu16<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u16Imm:$src2), + !strconcat("$dst.h = ", !strconcat(opc , "#$src2")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class si_ALU32_s16<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins s16Imm:$src1), + !strconcat("$dst = ", !strconcat(opc , "#$src1")), + [(set IntRegs:$dst, (IntID imm:$src1))]>; + +class di_ALU32_s8<string opc, Intrinsic IntID> + : ALU32_rr<(outs DoubleRegs:$dst), (ins s8Imm:$src1), + !strconcat("$dst = ", !strconcat(opc , "#$src1")), + [(set DoubleRegs:$dst, (IntID imm:$src1))]>; + +class di_ALU64_di<string opc, Intrinsic IntID> + : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "$src")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>; + +class si_ALU32_si<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "($src)")), + [(set IntRegs:$dst, (IntID IntRegs:$src))]>; + +class si_ALU32_si_tfr<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "$src")), + [(set IntRegs:$dst, (IntID IntRegs:$src))]>; + +// +// ALU 64 types. 
+// + +class si_ALU64_si_sat<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "($src):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src))]>; + +class si_ALU64_didi<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class di_ALU64_sidi<string opc, Intrinsic IntID> + : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2))]>; + +class di_ALU64_didi<string opc, Intrinsic IntID> + : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class di_ALU64_qididi<string opc, Intrinsic IntID> + : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2, + DoubleRegs:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2, + DoubleRegs:$src3))]>; + +class di_ALU64_sisi<string opc, Intrinsic IntID> + : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_ALU64_didi_sat<string opc, Intrinsic IntID> + : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class di_ALU64_didi_rnd<string opc, Intrinsic IntID> + : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class di_ALU64_didi_crnd<string opc, Intrinsic IntID> + : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):crnd")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class di_ALU64_didi_rnd_sat<string opc, Intrinsic IntID> + : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class di_ALU64_didi_crnd_sat<string opc, Intrinsic IntID> + : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):crnd:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class qi_ALU64_didi<string opc, Intrinsic IntID> + : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set PredRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class si_ALU64_sisi<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_sat_lh<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins 
IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_l16_sat_hh<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_l16_sat_lh<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_l16_sat_hl<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_l16_sat_ll<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_l16_hh<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_l16_hl<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_l16_lh<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_l16_ll<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_h16_sat_hh<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.H, $src2.H):sat:<<16")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_h16_sat_lh<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.L, $src2.H):sat:<<16")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_h16_sat_hl<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.H, $src2.L):sat:<<16")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_h16_sat_ll<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.L, $src2.L):sat:<<16")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_h16_hh<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<16")), + [(set 
IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_h16_hl<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<16")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_h16_lh<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<16")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_h16_ll<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<16")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_lh<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_ll<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_ALU64_sisi_sat<string opc, Intrinsic IntID> + : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +// +// SInst classes. +// + +class qi_SInst_qi<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "($src)")), + [(set PredRegs:$dst, (IntID IntRegs:$src))]>; + +class qi_SInst_qi_pxfer<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "$src")), + [(set PredRegs:$dst, (IntID IntRegs:$src))]>; + +class qi_SInst_qiqi<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class qi_SInst_qiqi_neg<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, !$src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_SInst_di<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "($src)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>; + +class di_SInst_di_sat<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "($src):sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>; + +class si_SInst_di<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "($src)")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src))]>; + +class si_SInst_di_sat<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "($src):sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src))]>; + +class di_SInst_disi<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc 
, "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>; + +class di_SInst_didi<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class di_SInst_si<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>; + +class si_SInst_sisiu3<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u3Imm:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2, + imm:$src3))]>; + +class si_SInst_diu5<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u5Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + +class si_SInst_disi<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>; + +class si_SInst_sidi<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2))]>; + +class di_SInst_disisi<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2, + IntRegs:$src3))]>; + +class di_SInst_sisi<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class qi_SInst_siu5<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class qi_SInst_siu6<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u6Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class qi_SInst_sisi<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_SInst_si<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "($src)")), + [(set IntRegs:$dst, (IntID IntRegs:$src))]>; + +class si_SInst_si_sat<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "($src):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src))]>; + +class di_SInst_qi<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "($src)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src))]>; + +class si_SInst_qi<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), 
(ins IntRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "$src")), + [(set IntRegs:$dst, (IntID IntRegs:$src))]>; + +class si_SInst_qiqi<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class qi_SInst_si<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src), + !strconcat("$dst = ", !strconcat(opc , "$src")), + [(set PredRegs:$dst, (IntID IntRegs:$src))]>; + +class si_SInst_sisi<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_SInst_diu6<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + +class si_SInst_siu5<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class si_SInst_siu5_rnd<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class si_SInst_siu5u5<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2, u5Imm:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2, imm:$src3))]>; + +class si_SInst_sisisi_acc<string opc, Intrinsic IntID> + : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_SInst_sisisi_nac<string opc, Intrinsic IntID> + : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_SInst_didisi_acc<string opc, Intrinsic IntID> + : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_SInst_didisi_nac<string opc, Intrinsic IntID> + : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, IntRegs:$src2))], + "$dst2 = $dst">; + +class si_SInst_sisiu5u5<string opc, Intrinsic IntID> + : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + u5Imm:$src2, u5Imm:$src3), + !strconcat("$dst = ", !strconcat(opc , + "($src1, #$src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + imm:$src2, imm:$src3))], + "$dst2 = $dst">; + +class si_SInst_sisidi<string opc, Intrinsic IntID> + : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + DoubleRegs:$src2), + 
!strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + DoubleRegs:$src2))], + "$dst2 = $dst">; + +class di_SInst_didiu6u6<string opc, Intrinsic IntID> + : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + u6Imm:$src2, u6Imm:$src3), + !strconcat("$dst = ", !strconcat(opc , + "($src1, #$src2, #$src3)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1, + imm:$src2, imm:$src3))], + "$dst2 = $dst">; + +class di_SInst_dididi<string opc, Intrinsic IntID> + : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2))], + "$dst2 = $dst">; + +class di_SInst_diu6u6<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2, + u6Imm:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2, + imm:$src3))]>; + +class di_SInst_didisi<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, + IntRegs:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2, + IntRegs:$src3))]>; + +class di_SInst_didiqi<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, + IntRegs:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2, + IntRegs:$src3))]>; + +class di_SInst_didiu3<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, + u3Imm:$src3), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2, + imm:$src3))]>; + +class di_SInst_didisi_or<string opc, Intrinsic IntID> + : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + IntRegs:$src2), + !strconcat("$dst |= ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_SInst_didisi_and<string opc, Intrinsic IntID> + : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + IntRegs:$src2), + !strconcat("$dst &= ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_SInst_didiu6_and<string opc, Intrinsic IntID> + : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + u6Imm:$src2), + !strconcat("$dst &= ", !strconcat(opc , "($src1, #$src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1, + imm:$src2))], + "$dst2 = $dst">; + +class di_SInst_didiu6_or<string opc, Intrinsic IntID> + : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + u6Imm:$src2), + !strconcat("$dst |= ", !strconcat(opc , "($src1, #$src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1, + imm:$src2))], + "$dst2 = $dst">; + +class di_SInst_didiu6_xor<string opc, Intrinsic IntID> + : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + u6Imm:$src2), + !strconcat("$dst ^= ", !strconcat(opc , "($src1, #$src2)")), + [(set 
DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1, + imm:$src2))], + "$dst2 = $dst">; + +class si_SInst_sisisi_and<string opc, Intrinsic IntID> + : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst &= ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_SInst_sisisi_or<string opc, Intrinsic IntID> + : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst |= ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + + +class si_SInst_sisiu5_and<string opc, Intrinsic IntID> + : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + u5Imm:$src2), + !strconcat("$dst &= ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + imm:$src2))], + "$dst2 = $dst">; + +class si_SInst_sisiu5_or<string opc, Intrinsic IntID> + : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + u5Imm:$src2), + !strconcat("$dst |= ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + imm:$src2))], + "$dst2 = $dst">; + +class si_SInst_sisiu5_xor<string opc, Intrinsic IntID> + : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + u5Imm:$src2), + !strconcat("$dst ^= ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + imm:$src2))], + "$dst2 = $dst">; + +class si_SInst_sisiu5_acc<string opc, Intrinsic IntID> + : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + u5Imm:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + imm:$src2))], + "$dst2 = $dst">; + +class si_SInst_sisiu5_nac<string opc, Intrinsic IntID> + : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + u5Imm:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + imm:$src2))], + "$dst2 = $dst">; + +class di_SInst_didiu6_acc<string opc, Intrinsic IntID> + : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + u5Imm:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1, #$src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, imm:$src2))], + "$dst2 = $dst">; + +class di_SInst_didiu6_nac<string opc, Intrinsic IntID> + : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + u5Imm:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1, #$src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1, + imm:$src2))], + "$dst2 = $dst">; + + +// +// MInst classes. 
+// + +class di_MInst_sisi_rnd_hh_s1<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.H, $src2.H):<<1:rnd")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_rnd_hh<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.H, $src2.H):rnd")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_rnd_hl_s1<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.H, $src2.L):<<1:rnd")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_rnd_hl<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.H, $src2.L):rnd")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_rnd_lh_s1<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.L, $src2.H):<<1:rnd")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_rnd_lh<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.L, $src2.H):rnd")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_rnd_ll_s1<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.L, $src2.L):<<1:rnd")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_rnd_ll<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.L, $src2.L):rnd")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_disisi_acc<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1, $src2):sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2):sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_sat_conj<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), 
(ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1, $src2*):sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac_sat_conj<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2*):sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac_s1_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1, $src2):<<1:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_s1_sat_conj<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2*):<<1:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac_s1_sat_conj<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1, $src2*):<<1:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_s8s8<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins s8Imm:$src1, s8Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "(#$src1, #$src2)")), + [(set DoubleRegs:$dst, (IntID imm:$src1, imm:$src2))]>; + +class si_MInst_sisi<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_hh<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_hh_s1<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<1")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_lh<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_lh_s1<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<1")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_hl<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_hl_s1<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<1")), + 
[(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_ll<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_ll_s1<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<1")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + + +class si_MInst_sisi_hh<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_hh_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<1")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_lh<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_lh_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<1")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_hl<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_hl_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<1")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_ll<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_ll_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<1")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_up<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_didi<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class di_MInst_didi_conj<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2*)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class di_MInst_sisi_s1_sat_conj<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", 
!strconcat(opc , + "($src1, $src2*):<<1:sat")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_didi_s1_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1, $src2):<<1:rnd:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class di_MInst_didi_sat<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class di_MInst_didi_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1, $src2):rnd:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class si_SInst_sisi_sat<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_s1_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1, $src2):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_l_s1_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1, $src2.L):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_h_s1_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1, $src2.H):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_rnd_sat_conj<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1, $src2*):rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_s1_rnd_sat_conj<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1, $src2*):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1, $src2):rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_rnd<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisisi_xacc<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst ^= ", !strconcat(opc , "($src2, $src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2, + IntRegs:$src3))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst += ", 
!strconcat(opc , "($src2, $src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2, + IntRegs:$src3))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst -= ", !strconcat(opc , "($src2, $src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2, + IntRegs:$src3))], + "$dst2 = $dst">; + +class si_MInst_sisis8_acc<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2, + s8Imm:$src3), + !strconcat("$dst += ", !strconcat(opc , "($src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2, + imm:$src3))], + "$dst2 = $dst">; + +class si_MInst_sisis8_nac<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2, + s8Imm:$src3), + !strconcat("$dst -= ", !strconcat(opc , "($src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2, + imm:$src3))], + "$dst2 = $dst">; + +class si_MInst_sisiu4u5<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + u4Imm:$src2, u5Imm:$src3), + !strconcat("$dst = ", !strconcat(opc , + "($src1, #$src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + imm:$src2, imm:$src3))], + "$dst2 = $dst">; + +class si_MInst_sisiu8_acc<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2, + u8Imm:$src3), + !strconcat("$dst += ", !strconcat(opc , "($src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2, + imm:$src3))], + "$dst2 = $dst">; + +class si_MInst_sisiu8_nac<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2, + u8Imm:$src3), + !strconcat("$dst -= ", !strconcat(opc , "($src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2, + imm:$src3))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_hh<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.H)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_sat_lh<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.L, $src2.H):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_sat_lh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.L, $src2.H):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_sat_hh<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.H, $src2.H):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_sat_hh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.H, $src2.H):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, 
IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_hh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.H, $src2.H):<<1")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_hh<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.H)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_sat_hh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.H, $src2.H):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_sat_hh<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.H, $src2.H):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_sat_hl_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.H, $src2.L):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_sat_hl<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.H, $src2.L):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_sat_lh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.L, $src2.H):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_sat_lh<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.L, $src2.H):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_sat_ll_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.L, $src2.L):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_sat_ll<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.L, $src2.L):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_hh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.H, $src2.H):<<1")), + [(set IntRegs:$dst, (IntID 
IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_hl<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.L)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_hl_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.H, $src2.L):<<1")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_hl<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.L)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_hl_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.H, $src2.L):<<1")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_lh<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.H)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_lh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.L, $src2.H):<<1")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_lh<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.H)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_lh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.L, $src2.H):<<1")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_ll<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.L)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_ll_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.L, $src2.L):<<1")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_sat_ll_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.L, $src2.L):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], 
+ "$dst2 = $dst">; + +class si_MInst_sisisi_acc_sat_hl_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.H, $src2.L):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_sat_ll<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.L, $src2.L):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_acc_sat_hl<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , + "($src1.H, $src2.L):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_ll<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.L)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_ll_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.L, $src2.L):<<1")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_hh_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.H, $src2.H):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_hh_s1_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.H, $src2.H):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_hl_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.H, $src2.L):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_hl_s1_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.H, $src2.L):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_lh_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.L, $src2.H):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_lh_s1_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.L, $src2.H):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + 
IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_ll_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.L, $src2.L):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_nac_ll_s1_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1.L, $src2.L):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_ALU32_sisi<string opc, Intrinsic IntID> + : ALU32_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_sat<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_sat_conj<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2*):sat")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_sisi_s1_sat<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_didi_s1_sat<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2))]>; + +class si_MInst_didi_s1_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1, $src2):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class si_MInst_didi_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd:sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class si_MInst_sisi_sat_hh<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_hh_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.H, $src2.H):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_hl<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):sat")), + [(set IntRegs:$dst, (IntID 
IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_hl_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.H, $src2.L):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_lh<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_lh_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.L, $src2.H):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_ll<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_ll_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.L, $src2.L):<<1:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_rnd_hh<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.H, $src2.H):rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_rnd_hh<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.H, $src2.H):rnd")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_rnd_hh_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1.H, $src2.H):<<1:rnd")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_rnd_hh_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , + "($src1.H, $src2.H):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_rnd_hl<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.H, $src2.L):rnd")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_rnd_hl_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.H, $src2.L):<<1:rnd")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_rnd_hl<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.H, $src2.L):rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_rnd_hl_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.H, $src2.L):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_rnd_lh<string opc, Intrinsic IntID> + 
: MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.L, $src2.H):rnd")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_rnd_lh<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.L, $src2.H):rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_rnd_lh_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.L, $src2.H):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_rnd_lh_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.L, $src2.H):<<1:rnd")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_rnd_ll<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.L, $src2.L):rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_sat_rnd_ll_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.L, $src2.L):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_rnd_ll<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.L, $src2.L):rnd")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_MInst_sisi_rnd_ll_s1<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1.L, $src2.L):<<1:rnd")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_dididi_acc_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, + DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1, $src2):sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_dididi_acc_rnd_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + DoubleRegs:$src2), + !strconcat("$dst += ", + !strconcat(opc , "($src1, $src2):rnd:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_dididi_acc_s1_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2), + !strconcat("$dst += ", + !strconcat(opc , "($src1, $src2):<<1:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_dididi_acc_s1_rnd_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + DoubleRegs:$src2), + !strconcat("$dst += ", + !strconcat(opc , "($src1, $src2):<<1:rnd:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_dididi_acc<string opc, Intrinsic IntID> + : MInst_acc<(outs 
DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + DoubleRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_dididi_acc_conj<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + DoubleRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1, $src2*)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_hh<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.H)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_hl<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.L)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_lh<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.H)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_ll<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.L)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_hh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", + !strconcat(opc , "($src1.H, $src2.H):<<1")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_hl_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", + !strconcat(opc , "($src1.H, $src2.L):<<1")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_lh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", + !strconcat(opc , "($src1.L, $src2.H):<<1")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_ll_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", + !strconcat(opc , "($src1.L, $src2.L):<<1")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac_hh<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.H)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + 
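A note on the naming scheme of these wrapper classes, inferred from the definitions above (a reading aid, not part of the patch): the leading si_/di_ encodes the result register class (IntRegs, 32-bit, vs. DoubleRegs, 64-bit pairs), the MInst/ALU32/ALU64/SInst part names the instruction format, the sisi/didi/disisi part lists the source operand classes, and the trailing tags map onto assembly modifiers: acc/nac/xacc for the += / -= / ^= accumulating forms (accumulator tied via "$dst2 = $dst"), hh/hl/lh/ll for .H/.L half-word selection, s1 for :<<1, rnd for :rnd, sat for :sat, and conj for the conjugated ($src2*) operand. A minimal sketch of how one of these classes is instantiated further down in this file, in the MTYPE/MPYS defs:

// Instantiation taken from the MPYS section below; di_MInst_disisi_acc_hh is
// the class declared just above.  TableGen expands it to an MInst_acc whose
// assembly string is "$dst += mpy($src1.H, $src2.H)" and whose pattern is
//   [(set DoubleRegs:$dst,
//         (int_hexagon_M2_mpyd_acc_hh_s0 DoubleRegs:$dst2,
//                                        IntRegs:$src1, IntRegs:$src2))]
// with the accumulator input tied to the result by "$dst2 = $dst".
def Hexagon_M2_mpyd_acc_hh_s0:
  di_MInst_disisi_acc_hh <"mpy", int_hexagon_M2_mpyd_acc_hh_s0>;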
+class di_MInst_disisi_nac_hl<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.L)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac_lh<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.H)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac_ll<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.L)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac_hh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", + !strconcat(opc , "($src1.H, $src2.H):<<1")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac_hl_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", + !strconcat(opc , "($src1.H, $src2.L):<<1")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac_lh_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", + !strconcat(opc , "($src1.L, $src2.H):<<1")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_nac_ll_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst -= ", + !strconcat(opc , "($src1.L, $src2.L):<<1")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disisi_acc_s1_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", + !strconcat(opc , "($src1, $src2):<<1:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class di_MInst_disi_s1_sat<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>; + +class di_MInst_didisi_acc_s1_sat<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + IntRegs:$src2), + !strconcat("$dst += ", + !strconcat(opc , "($src1, $src2):<<1:sat")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, + IntRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_disi_s1_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", + !strconcat(opc , "($src1, $src2):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>; + +class 
si_MInst_didi<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + + +/******************************************************************** +* ALU32/ALU * +*********************************************************************/ + +// ALU32 / ALU / Add. +def Hexagon_A2_add: + si_ALU32_sisi <"add", int_hexagon_A2_add>; +def Hexagon_A2_addi: + si_ALU32_sis16 <"add", int_hexagon_A2_addi>; + +// ALU32 / ALU / Logical operations. +def Hexagon_A2_and: + si_ALU32_sisi <"and", int_hexagon_A2_and>; +def Hexagon_A2_andir: + si_ALU32_sis10 <"and", int_hexagon_A2_andir>; +def Hexagon_A2_not: + si_ALU32_si <"not", int_hexagon_A2_not>; +def Hexagon_A2_or: + si_ALU32_sisi <"or", int_hexagon_A2_or>; +def Hexagon_A2_orir: + si_ALU32_sis10 <"or", int_hexagon_A2_orir>; +def Hexagon_A2_xor: + si_ALU32_sisi <"xor", int_hexagon_A2_xor>; + +// ALU32 / ALU / Negate. +def Hexagon_A2_neg: + si_ALU32_si <"neg", int_hexagon_A2_neg>; + +// ALU32 / ALU / Subtract. +def Hexagon_A2_sub: + si_ALU32_sisi <"sub", int_hexagon_A2_sub>; +def Hexagon_A2_subri: + si_ALU32_s10si <"sub", int_hexagon_A2_subri>; + +// ALU32 / ALU / Transfer Immediate. +def Hexagon_A2_tfril: + si_lo_ALU32_siu16 <"", int_hexagon_A2_tfril>; +def Hexagon_A2_tfrih: + si_hi_ALU32_siu16 <"", int_hexagon_A2_tfrih>; +def Hexagon_A2_tfrsi: + si_ALU32_s16 <"", int_hexagon_A2_tfrsi>; +def Hexagon_A2_tfrpi: + di_ALU32_s8 <"", int_hexagon_A2_tfrpi>; + +// ALU32 / ALU / Transfer Register. +def Hexagon_A2_tfr: + si_ALU32_si_tfr <"", int_hexagon_A2_tfr>; + +/******************************************************************** +* ALU32/PERM * +*********************************************************************/ + +// ALU32 / PERM / Combine. +def Hexagon_A2_combinew: + di_ALU32_sisi <"combine", int_hexagon_A2_combinew>; +def Hexagon_A2_combine_hh: + si_MInst_sisi_hh <"combine", int_hexagon_A2_combine_hh>; +def Hexagon_A2_combine_lh: + si_MInst_sisi_lh <"combine", int_hexagon_A2_combine_lh>; +def Hexagon_A2_combine_hl: + si_MInst_sisi_hl <"combine", int_hexagon_A2_combine_hl>; +def Hexagon_A2_combine_ll: + si_MInst_sisi_ll <"combine", int_hexagon_A2_combine_ll>; +def Hexagon_A2_combineii: + di_MInst_s8s8 <"combine", int_hexagon_A2_combineii>; + +// ALU32 / PERM / Mux. +def Hexagon_C2_mux: + si_ALU32_qisisi <"mux", int_hexagon_C2_mux>; +def Hexagon_C2_muxri: + si_ALU32_qis8si <"mux", int_hexagon_C2_muxri>; +def Hexagon_C2_muxir: + si_ALU32_qisis8 <"mux", int_hexagon_C2_muxir>; +def Hexagon_C2_muxii: + si_ALU32_qis8s8 <"mux", int_hexagon_C2_muxii>; + +// ALU32 / PERM / Shift halfword. +def Hexagon_A2_aslh: + si_ALU32_si <"aslh", int_hexagon_A2_aslh>; +def Hexagon_A2_asrh: + si_ALU32_si <"asrh", int_hexagon_A2_asrh>; +def SI_to_SXTHI_asrh: + si_ALU32_si <"asrh", int_hexagon_SI_to_SXTHI_asrh>; + +// ALU32 / PERM / Sign/zero extend. +def Hexagon_A2_sxth: + si_ALU32_si <"sxth", int_hexagon_A2_sxth>; +def Hexagon_A2_sxtb: + si_ALU32_si <"sxtb", int_hexagon_A2_sxtb>; +def Hexagon_A2_zxth: + si_ALU32_si <"zxth", int_hexagon_A2_zxth>; +def Hexagon_A2_zxtb: + si_ALU32_si <"zxtb", int_hexagon_A2_zxtb>; + +/******************************************************************** +* ALU32/PRED * +*********************************************************************/ + +// ALU32 / PRED / Compare. 
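The compare defs that follow produce a predicate result; the qi_ALU32_* classes they use are declared earlier in this file and are not visible in this hunk, so the expansion sketched here is an assumption by analogy with the si_/di_ classes above:

// Taken from the list below; assuming qi_ALU32_sisi follows the same pattern
// as the classes above, this should expand to an ALU32-class instruction with
// the assembly string "$dst = cmp.gt($src1, $src2)", a predicate-register
// destination, and a pattern matching int_hexagon_C2_cmpgt on two IntRegs.
def Hexagon_C2_cmpgt:
  qi_ALU32_sisi <"cmp.gt", int_hexagon_C2_cmpgt>;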
+def Hexagon_C2_cmpeq: + qi_ALU32_sisi <"cmp.eq", int_hexagon_C2_cmpeq>; +def Hexagon_C2_cmpeqi: + qi_ALU32_sis10 <"cmp.eq", int_hexagon_C2_cmpeqi>; +def Hexagon_C2_cmpgei: + qi_ALU32_sis8 <"cmp.ge", int_hexagon_C2_cmpgei>; +def Hexagon_C2_cmpgeui: + qi_ALU32_siu8 <"cmp.geu", int_hexagon_C2_cmpgeui>; +def Hexagon_C2_cmpgt: + qi_ALU32_sisi <"cmp.gt", int_hexagon_C2_cmpgt>; +def Hexagon_C2_cmpgti: + qi_ALU32_sis10 <"cmp.gt", int_hexagon_C2_cmpgti>; +def Hexagon_C2_cmpgtu: + qi_ALU32_sisi <"cmp.gtu", int_hexagon_C2_cmpgtu>; +def Hexagon_C2_cmpgtui: + qi_ALU32_siu9 <"cmp.gtu", int_hexagon_C2_cmpgtui>; +def Hexagon_C2_cmplt: + qi_ALU32_sisi <"cmp.lt", int_hexagon_C2_cmplt>; +def Hexagon_C2_cmpltu: + qi_ALU32_sisi <"cmp.ltu", int_hexagon_C2_cmpltu>; + +/******************************************************************** +* ALU32/VH * +*********************************************************************/ + +// ALU32 / VH / Vector add halfwords. +// Rd32=vadd[u]h(Rs32,Rt32:sat] +def Hexagon_A2_svaddh: + si_ALU32_sisi <"vaddh", int_hexagon_A2_svaddh>; +def Hexagon_A2_svaddhs: + si_ALU32_sisi_sat <"vaddh", int_hexagon_A2_svaddhs>; +def Hexagon_A2_svadduhs: + si_ALU32_sisi_sat <"vadduh", int_hexagon_A2_svadduhs>; + +// ALU32 / VH / Vector average halfwords. +def Hexagon_A2_svavgh: + si_ALU32_sisi <"vavgh", int_hexagon_A2_svavgh>; +def Hexagon_A2_svavghs: + si_ALU32_sisi_rnd <"vavgh", int_hexagon_A2_svavghs>; +def Hexagon_A2_svnavgh: + si_ALU32_sisi <"vnavgh", int_hexagon_A2_svnavgh>; + +// ALU32 / VH / Vector subtract halfwords. +def Hexagon_A2_svsubh: + si_ALU32_sisi <"vsubh", int_hexagon_A2_svsubh>; +def Hexagon_A2_svsubhs: + si_ALU32_sisi_sat <"vsubh", int_hexagon_A2_svsubhs>; +def Hexagon_A2_svsubuhs: + si_ALU32_sisi_sat <"vsubuh", int_hexagon_A2_svsubuhs>; + +/******************************************************************** +* ALU64/ALU * +*********************************************************************/ + +// ALU64 / ALU / Add. +def Hexagon_A2_addp: + di_ALU64_didi <"add", int_hexagon_A2_addp>; +def Hexagon_A2_addsat: + si_ALU64_sisi_sat <"add", int_hexagon_A2_addsat>; + +// ALU64 / ALU / Add halfword. +// Even though the definition says hl, it should be lh - +//so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits. +def Hexagon_A2_addh_l16_hl: + si_ALU64_sisi_l16_lh <"add", int_hexagon_A2_addh_l16_hl>; +def Hexagon_A2_addh_l16_ll: + si_ALU64_sisi_l16_ll <"add", int_hexagon_A2_addh_l16_ll>; + +def Hexagon_A2_addh_l16_sat_hl: + si_ALU64_sisi_l16_sat_lh <"add", int_hexagon_A2_addh_l16_sat_hl>; +def Hexagon_A2_addh_l16_sat_ll: + si_ALU64_sisi_l16_sat_ll <"add", int_hexagon_A2_addh_l16_sat_ll>; + +def Hexagon_A2_addh_h16_hh: + si_ALU64_sisi_h16_hh <"add", int_hexagon_A2_addh_h16_hh>; +def Hexagon_A2_addh_h16_hl: + si_ALU64_sisi_h16_hl <"add", int_hexagon_A2_addh_h16_hl>; +def Hexagon_A2_addh_h16_lh: + si_ALU64_sisi_h16_lh <"add", int_hexagon_A2_addh_h16_lh>; +def Hexagon_A2_addh_h16_ll: + si_ALU64_sisi_h16_ll <"add", int_hexagon_A2_addh_h16_ll>; + +def Hexagon_A2_addh_h16_sat_hh: + si_ALU64_sisi_h16_sat_hh <"add", int_hexagon_A2_addh_h16_sat_hh>; +def Hexagon_A2_addh_h16_sat_hl: + si_ALU64_sisi_h16_sat_hl <"add", int_hexagon_A2_addh_h16_sat_hl>; +def Hexagon_A2_addh_h16_sat_lh: + si_ALU64_sisi_h16_sat_lh <"add", int_hexagon_A2_addh_h16_sat_lh>; +def Hexagon_A2_addh_h16_sat_ll: + si_ALU64_sisi_h16_sat_ll <"add", int_hexagon_A2_addh_h16_sat_ll>; + +// ALU64 / ALU / Compare. 
+def Hexagon_C2_cmpeqp: + qi_ALU64_didi <"cmp.eq", int_hexagon_C2_cmpeqp>; +def Hexagon_C2_cmpgtp: + qi_ALU64_didi <"cmp.gt", int_hexagon_C2_cmpgtp>; +def Hexagon_C2_cmpgtup: + qi_ALU64_didi <"cmp.gtu", int_hexagon_C2_cmpgtup>; + +// ALU64 / ALU / Logical operations. +def Hexagon_A2_andp: + di_ALU64_didi <"and", int_hexagon_A2_andp>; +def Hexagon_A2_orp: + di_ALU64_didi <"or", int_hexagon_A2_orp>; +def Hexagon_A2_xorp: + di_ALU64_didi <"xor", int_hexagon_A2_xorp>; + +// ALU64 / ALU / Maximum. +def Hexagon_A2_max: + si_ALU64_sisi <"max", int_hexagon_A2_max>; +def Hexagon_A2_maxu: + si_ALU64_sisi <"maxu", int_hexagon_A2_maxu>; + +// ALU64 / ALU / Minimum. +def Hexagon_A2_min: + si_ALU64_sisi <"min", int_hexagon_A2_min>; +def Hexagon_A2_minu: + si_ALU64_sisi <"minu", int_hexagon_A2_minu>; + +// ALU64 / ALU / Subtract. +def Hexagon_A2_subp: + di_ALU64_didi <"sub", int_hexagon_A2_subp>; +def Hexagon_A2_subsat: + si_ALU64_sisi_sat <"sub", int_hexagon_A2_subsat>; + +// ALU64 / ALU / Subtract halfword. +// Even though the definition says hl, it should be lh - +//so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits. +def Hexagon_A2_subh_l16_hl: + si_ALU64_sisi_l16_lh <"sub", int_hexagon_A2_subh_l16_hl>; +def Hexagon_A2_subh_l16_ll: + si_ALU64_sisi_l16_ll <"sub", int_hexagon_A2_subh_l16_ll>; + +def Hexagon_A2_subh_l16_sat_hl: + si_ALU64_sisi_l16_sat_lh <"sub", int_hexagon_A2_subh_l16_sat_hl>; +def Hexagon_A2_subh_l16_sat_ll: + si_ALU64_sisi_l16_sat_ll <"sub", int_hexagon_A2_subh_l16_sat_ll>; + +def Hexagon_A2_subh_h16_hh: + si_ALU64_sisi_h16_hh <"sub", int_hexagon_A2_subh_h16_hh>; +def Hexagon_A2_subh_h16_hl: + si_ALU64_sisi_h16_hl <"sub", int_hexagon_A2_subh_h16_hl>; +def Hexagon_A2_subh_h16_lh: + si_ALU64_sisi_h16_lh <"sub", int_hexagon_A2_subh_h16_lh>; +def Hexagon_A2_subh_h16_ll: + si_ALU64_sisi_h16_ll <"sub", int_hexagon_A2_subh_h16_ll>; + +def Hexagon_A2_subh_h16_sat_hh: + si_ALU64_sisi_h16_sat_hh <"sub", int_hexagon_A2_subh_h16_sat_hh>; +def Hexagon_A2_subh_h16_sat_hl: + si_ALU64_sisi_h16_sat_hl <"sub", int_hexagon_A2_subh_h16_sat_hl>; +def Hexagon_A2_subh_h16_sat_lh: + si_ALU64_sisi_h16_sat_lh <"sub", int_hexagon_A2_subh_h16_sat_lh>; +def Hexagon_A2_subh_h16_sat_ll: + si_ALU64_sisi_h16_sat_ll <"sub", int_hexagon_A2_subh_h16_sat_ll>; + +// ALU64 / ALU / Transfer register. +def Hexagon_A2_tfrp: + di_ALU64_di <"", int_hexagon_A2_tfrp>; + +/******************************************************************** +* ALU64/BIT * +*********************************************************************/ + +// ALU64 / BIT / Masked parity. +def Hexagon_S2_parityp: + si_ALU64_didi <"parity", int_hexagon_S2_parityp>; + +/******************************************************************** +* ALU64/PERM * +*********************************************************************/ + +// ALU64 / PERM / Vector pack high and low halfwords. +def Hexagon_S2_packhl: + di_ALU64_sisi <"packhl", int_hexagon_S2_packhl>; + +/******************************************************************** +* ALU64/VB * +*********************************************************************/ + +// ALU64 / VB / Vector add unsigned bytes. +def Hexagon_A2_vaddub: + di_ALU64_didi <"vaddub", int_hexagon_A2_vaddub>; +def Hexagon_A2_vaddubs: + di_ALU64_didi_sat <"vaddub", int_hexagon_A2_vaddubs>; + +// ALU64 / VB / Vector average unsigned bytes. 
+def Hexagon_A2_vavgub: + di_ALU64_didi <"vavgub", int_hexagon_A2_vavgub>; +def Hexagon_A2_vavgubr: + di_ALU64_didi_rnd <"vavgub", int_hexagon_A2_vavgubr>; + +// ALU64 / VB / Vector compare unsigned bytes. +def Hexagon_A2_vcmpbeq: + qi_ALU64_didi <"vcmpb.eq", int_hexagon_A2_vcmpbeq>; +def Hexagon_A2_vcmpbgtu: + qi_ALU64_didi <"vcmpb.gtu",int_hexagon_A2_vcmpbgtu>; + +// ALU64 / VB / Vector maximum/minimum unsigned bytes. +def Hexagon_A2_vmaxub: + di_ALU64_didi <"vmaxub", int_hexagon_A2_vmaxub>; +def Hexagon_A2_vminub: + di_ALU64_didi <"vminub", int_hexagon_A2_vminub>; + +// ALU64 / VB / Vector subtract unsigned bytes. +def Hexagon_A2_vsubub: + di_ALU64_didi <"vsubub", int_hexagon_A2_vsubub>; +def Hexagon_A2_vsububs: + di_ALU64_didi_sat <"vsubub", int_hexagon_A2_vsububs>; + +// ALU64 / VB / Vector mux. +def Hexagon_C2_vmux: + di_ALU64_qididi <"vmux", int_hexagon_C2_vmux>; + + +/******************************************************************** +* ALU64/VH * +*********************************************************************/ + +// ALU64 / VH / Vector add halfwords. +// Rdd64=vadd[u]h(Rss64,Rtt64:sat] +def Hexagon_A2_vaddh: + di_ALU64_didi <"vaddh", int_hexagon_A2_vaddh>; +def Hexagon_A2_vaddhs: + di_ALU64_didi_sat <"vaddh", int_hexagon_A2_vaddhs>; +def Hexagon_A2_vadduhs: + di_ALU64_didi_sat <"vadduh", int_hexagon_A2_vadduhs>; + +// ALU64 / VH / Vector average halfwords. +// Rdd64=v[n]avg[u]h(Rss64,Rtt64:rnd/:crnd][:sat] +def Hexagon_A2_vavgh: + di_ALU64_didi <"vavgh", int_hexagon_A2_vavgh>; +def Hexagon_A2_vavghcr: + di_ALU64_didi_crnd <"vavgh", int_hexagon_A2_vavghcr>; +def Hexagon_A2_vavghr: + di_ALU64_didi_rnd <"vavgh", int_hexagon_A2_vavghr>; +def Hexagon_A2_vavguh: + di_ALU64_didi <"vavguh", int_hexagon_A2_vavguh>; +def Hexagon_A2_vavguhr: + di_ALU64_didi_rnd <"vavguh", int_hexagon_A2_vavguhr>; +def Hexagon_A2_vnavgh: + di_ALU64_didi <"vnavgh", int_hexagon_A2_vnavgh>; +def Hexagon_A2_vnavghcr: + di_ALU64_didi_crnd_sat <"vnavgh", int_hexagon_A2_vnavghcr>; +def Hexagon_A2_vnavghr: + di_ALU64_didi_rnd_sat <"vnavgh", int_hexagon_A2_vnavghr>; + +// ALU64 / VH / Vector compare halfwords. +def Hexagon_A2_vcmpheq: + qi_ALU64_didi <"vcmph.eq", int_hexagon_A2_vcmpheq>; +def Hexagon_A2_vcmphgt: + qi_ALU64_didi <"vcmph.gt", int_hexagon_A2_vcmphgt>; +def Hexagon_A2_vcmphgtu: + qi_ALU64_didi <"vcmph.gtu",int_hexagon_A2_vcmphgtu>; + +// ALU64 / VH / Vector maximum halfwords. +def Hexagon_A2_vmaxh: + di_ALU64_didi <"vmaxh", int_hexagon_A2_vmaxh>; +def Hexagon_A2_vmaxuh: + di_ALU64_didi <"vmaxuh", int_hexagon_A2_vmaxuh>; + +// ALU64 / VH / Vector minimum halfwords. +def Hexagon_A2_vminh: + di_ALU64_didi <"vminh", int_hexagon_A2_vminh>; +def Hexagon_A2_vminuh: + di_ALU64_didi <"vminuh", int_hexagon_A2_vminuh>; + +// ALU64 / VH / Vector subtract halfwords. +def Hexagon_A2_vsubh: + di_ALU64_didi <"vsubh", int_hexagon_A2_vsubh>; +def Hexagon_A2_vsubhs: + di_ALU64_didi_sat <"vsubh", int_hexagon_A2_vsubhs>; +def Hexagon_A2_vsubuhs: + di_ALU64_didi_sat <"vsubuh", int_hexagon_A2_vsubuhs>; + + +/******************************************************************** +* ALU64/VW * +*********************************************************************/ + +// ALU64 / VW / Vector add words. +// Rdd32=vaddw(Rss32,Rtt32)[:sat] +def Hexagon_A2_vaddw: + di_ALU64_didi <"vaddw", int_hexagon_A2_vaddw>; +def Hexagon_A2_vaddws: + di_ALU64_didi_sat <"vaddw", int_hexagon_A2_vaddws>; + +// ALU64 / VW / Vector average words. 
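In the vector-average groups here and below, the _r and _cr def suffixes pick the _rnd and _crnd(_sat) class variants, selecting the :rnd and :crnd (convergent round) forms; those di_ALU64_didi_* classes are declared earlier in the file, so the expansion below is an assumption based on the modifier convention of the MInst classes above:

// From the ALU64/VW group that follows; assuming di_ALU64_didi_rnd appends
// ":rnd" the same way the *_rnd MInst classes above do, this expands to
// "$dst = vavgw($src1, $src2):rnd" operating on 64-bit register pairs.
def Hexagon_A2_vavgwr:
  di_ALU64_didi_rnd <"vavgw", int_hexagon_A2_vavgwr>;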
+def Hexagon_A2_vavguw: + di_ALU64_didi <"vavguw", int_hexagon_A2_vavguw>; +def Hexagon_A2_vavguwr: + di_ALU64_didi_rnd <"vavguw", int_hexagon_A2_vavguwr>; +def Hexagon_A2_vavgw: + di_ALU64_didi <"vavgw", int_hexagon_A2_vavgw>; +def Hexagon_A2_vavgwcr: + di_ALU64_didi_crnd <"vavgw", int_hexagon_A2_vavgwcr>; +def Hexagon_A2_vavgwr: + di_ALU64_didi_rnd <"vavgw", int_hexagon_A2_vavgwr>; +def Hexagon_A2_vnavgw: + di_ALU64_didi <"vnavgw", int_hexagon_A2_vnavgw>; +def Hexagon_A2_vnavgwcr: + di_ALU64_didi_crnd_sat <"vnavgw", int_hexagon_A2_vnavgwcr>; +def Hexagon_A2_vnavgwr: + di_ALU64_didi_rnd_sat <"vnavgw", int_hexagon_A2_vnavgwr>; + +// ALU64 / VW / Vector compare words. +def Hexagon_A2_vcmpweq: + qi_ALU64_didi <"vcmpw.eq", int_hexagon_A2_vcmpweq>; +def Hexagon_A2_vcmpwgt: + qi_ALU64_didi <"vcmpw.gt", int_hexagon_A2_vcmpwgt>; +def Hexagon_A2_vcmpwgtu: + qi_ALU64_didi <"vcmpw.gtu",int_hexagon_A2_vcmpwgtu>; + +// ALU64 / VW / Vector maximum words. +def Hexagon_A2_vmaxw: + di_ALU64_didi <"vmaxw", int_hexagon_A2_vmaxw>; +def Hexagon_A2_vmaxuw: + di_ALU64_didi <"vmaxuw", int_hexagon_A2_vmaxuw>; + +// ALU64 / VW / Vector minimum words. +def Hexagon_A2_vminw: + di_ALU64_didi <"vminw", int_hexagon_A2_vminw>; +def Hexagon_A2_vminuw: + di_ALU64_didi <"vminuw", int_hexagon_A2_vminuw>; + +// ALU64 / VW / Vector subtract words. +def Hexagon_A2_vsubw: + di_ALU64_didi <"vsubw", int_hexagon_A2_vsubw>; +def Hexagon_A2_vsubws: + di_ALU64_didi_sat <"vsubw", int_hexagon_A2_vsubws>; + + +/******************************************************************** +* CR * +*********************************************************************/ + +// CR / Logical reductions on predicates. +def Hexagon_C2_all8: + qi_SInst_qi <"all8", int_hexagon_C2_all8>; +def Hexagon_C2_any8: + qi_SInst_qi <"any8", int_hexagon_C2_any8>; + +// CR / Logical operations on predicates. +def Hexagon_C2_pxfer_map: + qi_SInst_qi_pxfer <"", int_hexagon_C2_pxfer_map>; +def Hexagon_C2_and: + qi_SInst_qiqi <"and", int_hexagon_C2_and>; +def Hexagon_C2_andn: + qi_SInst_qiqi_neg <"and", int_hexagon_C2_andn>; +def Hexagon_C2_not: + qi_SInst_qi <"not", int_hexagon_C2_not>; +def Hexagon_C2_or: + qi_SInst_qiqi <"or", int_hexagon_C2_or>; +def Hexagon_C2_orn: + qi_SInst_qiqi_neg <"or", int_hexagon_C2_orn>; +def Hexagon_C2_xor: + qi_SInst_qiqi <"xor", int_hexagon_C2_xor>; + + +/******************************************************************** +* MTYPE/ALU * +*********************************************************************/ + +// MTYPE / ALU / Add and accumulate. +def Hexagon_M2_acci: + si_MInst_sisisi_acc <"add", int_hexagon_M2_acci>; +def Hexagon_M2_accii: + si_MInst_sisis8_acc <"add", int_hexagon_M2_accii>; +def Hexagon_M2_nacci: + si_MInst_sisisi_nac <"add", int_hexagon_M2_nacci>; +def Hexagon_M2_naccii: + si_MInst_sisis8_nac <"add", int_hexagon_M2_naccii>; + +// MTYPE / ALU / Subtract and accumulate. +def Hexagon_M2_subacc: + si_MInst_sisisi_acc <"sub", int_hexagon_M2_subacc>; + +// MTYPE / ALU / Vector absolute difference. +def Hexagon_M2_vabsdiffh: + di_MInst_didi <"vabsdiffh",int_hexagon_M2_vabsdiffh>; +def Hexagon_M2_vabsdiffw: + di_MInst_didi <"vabsdiffw",int_hexagon_M2_vabsdiffw>; + +// MTYPE / ALU / XOR and xor with destination. +def Hexagon_M2_xor_xacc: + si_MInst_sisisi_xacc <"xor", int_hexagon_M2_xor_xacc>; + + +/******************************************************************** +* MTYPE/COMPLEX * +*********************************************************************/ + +// MTYPE / COMPLEX / Complex multiply. 
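The complex-multiply defs below are direct instantiations of the di_MInst_sisi_* classes declared above; one fully visible example of how class and def line up (a reading of the patch, not an addition to it):

// From the list that follows.  di_MInst_sisi_s1_sat (declared above) gives a
// 64-bit DoubleRegs result from two IntRegs sources, the assembly string
// "$dst = cmpy($src1, $src2):<<1:sat", and the pattern
//   [(set DoubleRegs:$dst,
//         (int_hexagon_M2_cmpys_s1 IntRegs:$src1, IntRegs:$src2))]
def Hexagon_M2_cmpys_s1:
  di_MInst_sisi_s1_sat <"cmpy", int_hexagon_M2_cmpys_s1>;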
+// Rdd[-+]=cmpy(Rs, Rt:<<1]:sat +def Hexagon_M2_cmpys_s1: + di_MInst_sisi_s1_sat <"cmpy", int_hexagon_M2_cmpys_s1>; +def Hexagon_M2_cmpys_s0: + di_MInst_sisi_sat <"cmpy", int_hexagon_M2_cmpys_s0>; +def Hexagon_M2_cmpysc_s1: + di_MInst_sisi_s1_sat_conj <"cmpy", int_hexagon_M2_cmpysc_s1>; +def Hexagon_M2_cmpysc_s0: + di_MInst_sisi_sat_conj <"cmpy", int_hexagon_M2_cmpysc_s0>; + +def Hexagon_M2_cmacs_s1: + di_MInst_disisi_acc_s1_sat <"cmpy", int_hexagon_M2_cmacs_s1>; +def Hexagon_M2_cmacs_s0: + di_MInst_disisi_acc_sat <"cmpy", int_hexagon_M2_cmacs_s0>; +def Hexagon_M2_cmacsc_s1: + di_MInst_disisi_acc_s1_sat_conj <"cmpy", int_hexagon_M2_cmacsc_s1>; +def Hexagon_M2_cmacsc_s0: + di_MInst_disisi_acc_sat_conj <"cmpy", int_hexagon_M2_cmacsc_s0>; + +def Hexagon_M2_cnacs_s1: + di_MInst_disisi_nac_s1_sat <"cmpy", int_hexagon_M2_cnacs_s1>; +def Hexagon_M2_cnacs_s0: + di_MInst_disisi_nac_sat <"cmpy", int_hexagon_M2_cnacs_s0>; +def Hexagon_M2_cnacsc_s1: + di_MInst_disisi_nac_s1_sat_conj <"cmpy", int_hexagon_M2_cnacsc_s1>; +def Hexagon_M2_cnacsc_s0: + di_MInst_disisi_nac_sat_conj <"cmpy", int_hexagon_M2_cnacsc_s0>; + +// MTYPE / COMPLEX / Complex multiply real or imaginary. +def Hexagon_M2_cmpyr_s0: + di_MInst_sisi <"cmpyr", int_hexagon_M2_cmpyr_s0>; +def Hexagon_M2_cmacr_s0: + di_MInst_disisi_acc <"cmpyr", int_hexagon_M2_cmacr_s0>; + +def Hexagon_M2_cmpyi_s0: + di_MInst_sisi <"cmpyi", int_hexagon_M2_cmpyi_s0>; +def Hexagon_M2_cmaci_s0: + di_MInst_disisi_acc <"cmpyi", int_hexagon_M2_cmaci_s0>; + +// MTYPE / COMPLEX / Complex multiply with round and pack. +// Rxx32+=cmpy(Rs32,[*]Rt32:<<1]:rnd:sat +def Hexagon_M2_cmpyrs_s0: + si_MInst_sisi_rnd_sat <"cmpy", int_hexagon_M2_cmpyrs_s0>; +def Hexagon_M2_cmpyrs_s1: + si_MInst_sisi_s1_rnd_sat <"cmpy", int_hexagon_M2_cmpyrs_s1>; + +def Hexagon_M2_cmpyrsc_s0: + si_MInst_sisi_rnd_sat_conj <"cmpy", int_hexagon_M2_cmpyrsc_s0>; +def Hexagon_M2_cmpyrsc_s1: + si_MInst_sisi_s1_rnd_sat_conj <"cmpy", int_hexagon_M2_cmpyrsc_s1>; + +//MTYPE / COMPLEX / Vector complex multiply real or imaginary. +def Hexagon_M2_vcmpy_s0_sat_i: + di_MInst_didi_sat <"vcmpyi", int_hexagon_M2_vcmpy_s0_sat_i>; +def Hexagon_M2_vcmpy_s1_sat_i: + di_MInst_didi_s1_sat <"vcmpyi", int_hexagon_M2_vcmpy_s1_sat_i>; + +def Hexagon_M2_vcmpy_s0_sat_r: + di_MInst_didi_sat <"vcmpyr", int_hexagon_M2_vcmpy_s0_sat_r>; +def Hexagon_M2_vcmpy_s1_sat_r: + di_MInst_didi_s1_sat <"vcmpyr", int_hexagon_M2_vcmpy_s1_sat_r>; + +def Hexagon_M2_vcmac_s0_sat_i: + di_MInst_dididi_acc_sat <"vcmpyi", int_hexagon_M2_vcmac_s0_sat_i>; +def Hexagon_M2_vcmac_s0_sat_r: + di_MInst_dididi_acc_sat <"vcmpyr", int_hexagon_M2_vcmac_s0_sat_r>; + +//MTYPE / COMPLEX / Vector reduce complex multiply real or imaginary. 
+def Hexagon_M2_vrcmpyi_s0: + di_MInst_didi <"vrcmpyi", int_hexagon_M2_vrcmpyi_s0>; +def Hexagon_M2_vrcmpyr_s0: + di_MInst_didi <"vrcmpyr", int_hexagon_M2_vrcmpyr_s0>; + +def Hexagon_M2_vrcmpyi_s0c: + di_MInst_didi_conj <"vrcmpyi", int_hexagon_M2_vrcmpyi_s0c>; +def Hexagon_M2_vrcmpyr_s0c: + di_MInst_didi_conj <"vrcmpyr", int_hexagon_M2_vrcmpyr_s0c>; + +def Hexagon_M2_vrcmaci_s0: + di_MInst_dididi_acc <"vrcmpyi", int_hexagon_M2_vrcmaci_s0>; +def Hexagon_M2_vrcmacr_s0: + di_MInst_dididi_acc <"vrcmpyr", int_hexagon_M2_vrcmacr_s0>; + +def Hexagon_M2_vrcmaci_s0c: + di_MInst_dididi_acc_conj <"vrcmpyi", int_hexagon_M2_vrcmaci_s0c>; +def Hexagon_M2_vrcmacr_s0c: + di_MInst_dididi_acc_conj <"vrcmpyr", int_hexagon_M2_vrcmacr_s0c>; + + +/******************************************************************** +* MTYPE/MPYH * +*********************************************************************/ + +// MTYPE / MPYH / Multiply and use lower result. +//def Hexagon_M2_mpysmi: +// si_MInst_sim9 <"mpyi", int_hexagon_M2_mpysmi>; +def Hexagon_M2_mpyi: + si_MInst_sisi <"mpyi", int_hexagon_M2_mpyi>; +def Hexagon_M2_mpyui: + si_MInst_sisi <"mpyui", int_hexagon_M2_mpyui>; +def Hexagon_M2_macsip: + si_MInst_sisiu8_acc <"mpyi", int_hexagon_M2_macsip>; +def Hexagon_M2_maci: + si_MInst_sisisi_acc <"mpyi", int_hexagon_M2_maci>; +def Hexagon_M2_macsin: + si_MInst_sisiu8_nac <"mpyi", int_hexagon_M2_macsin>; + +// MTYPE / MPYH / Multiply word by half (32x16). +//Rdd[+]=vmpywoh(Rss,Rtt)[:<<1][:rnd][:sat] +//Rdd[+]=vmpyweh(Rss,Rtt)[:<<1][:rnd][:sat] +def Hexagon_M2_mmpyl_rs1: + di_MInst_didi_s1_rnd_sat <"vmpyweh", int_hexagon_M2_mmpyl_rs1>; +def Hexagon_M2_mmpyl_s1: + di_MInst_didi_s1_sat <"vmpyweh", int_hexagon_M2_mmpyl_s1>; +def Hexagon_M2_mmpyl_rs0: + di_MInst_didi_rnd_sat <"vmpyweh", int_hexagon_M2_mmpyl_rs0>; +def Hexagon_M2_mmpyl_s0: + di_MInst_didi_sat <"vmpyweh", int_hexagon_M2_mmpyl_s0>; +def Hexagon_M2_mmpyh_rs1: + di_MInst_didi_s1_rnd_sat <"vmpywoh", int_hexagon_M2_mmpyh_rs1>; +def Hexagon_M2_mmpyh_s1: + di_MInst_didi_s1_sat <"vmpywoh", int_hexagon_M2_mmpyh_s1>; +def Hexagon_M2_mmpyh_rs0: + di_MInst_didi_rnd_sat <"vmpywoh", int_hexagon_M2_mmpyh_rs0>; +def Hexagon_M2_mmpyh_s0: + di_MInst_didi_sat <"vmpywoh", int_hexagon_M2_mmpyh_s0>; +def Hexagon_M2_mmacls_rs1: + di_MInst_dididi_acc_s1_rnd_sat <"vmpyweh", int_hexagon_M2_mmacls_rs1>; +def Hexagon_M2_mmacls_s1: + di_MInst_dididi_acc_s1_sat <"vmpyweh", int_hexagon_M2_mmacls_s1>; +def Hexagon_M2_mmacls_rs0: + di_MInst_dididi_acc_rnd_sat <"vmpyweh", int_hexagon_M2_mmacls_rs0>; +def Hexagon_M2_mmacls_s0: + di_MInst_dididi_acc_sat <"vmpyweh", int_hexagon_M2_mmacls_s0>; +def Hexagon_M2_mmachs_rs1: + di_MInst_dididi_acc_s1_rnd_sat <"vmpywoh", int_hexagon_M2_mmachs_rs1>; +def Hexagon_M2_mmachs_s1: + di_MInst_dididi_acc_s1_sat <"vmpywoh", int_hexagon_M2_mmachs_s1>; +def Hexagon_M2_mmachs_rs0: + di_MInst_dididi_acc_rnd_sat <"vmpywoh", int_hexagon_M2_mmachs_rs0>; +def Hexagon_M2_mmachs_s0: + di_MInst_dididi_acc_sat <"vmpywoh", int_hexagon_M2_mmachs_s0>; + +// MTYPE / MPYH / Multiply word by unsigned half (32x16). 
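In the vmpyweh/vmpywoh groups above and the unsigned variants below, the def suffix encodes the modifier combination, readable from which di_MInst_didi_* class each def picks: _s0 for :sat, _s1 for :<<1:sat, _rs0 for :rnd:sat, _rs1 for :<<1:rnd:sat. For example:

// From the group above; di_MInst_didi_s1_rnd_sat (declared at the top of this
// hunk) yields "$dst = vmpyweh($src1, $src2):<<1:rnd:sat" with DoubleRegs for
// both sources and the destination, matching int_hexagon_M2_mmpyl_rs1.
def Hexagon_M2_mmpyl_rs1:
  di_MInst_didi_s1_rnd_sat <"vmpyweh", int_hexagon_M2_mmpyl_rs1>;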
+//Rdd[+]=vmpywouh(Rss,Rtt)[:<<1][:rnd][:sat] +//Rdd[+]=vmpyweuh(Rss,Rtt)[:<<1][:rnd][:sat] +def Hexagon_M2_mmpyul_rs1: + di_MInst_didi_s1_rnd_sat <"vmpyweuh", int_hexagon_M2_mmpyul_rs1>; +def Hexagon_M2_mmpyul_s1: + di_MInst_didi_s1_sat <"vmpyweuh", int_hexagon_M2_mmpyul_s1>; +def Hexagon_M2_mmpyul_rs0: + di_MInst_didi_rnd_sat <"vmpyweuh", int_hexagon_M2_mmpyul_rs0>; +def Hexagon_M2_mmpyul_s0: + di_MInst_didi_sat <"vmpyweuh", int_hexagon_M2_mmpyul_s0>; +def Hexagon_M2_mmpyuh_rs1: + di_MInst_didi_s1_rnd_sat <"vmpywouh", int_hexagon_M2_mmpyuh_rs1>; +def Hexagon_M2_mmpyuh_s1: + di_MInst_didi_s1_sat <"vmpywouh", int_hexagon_M2_mmpyuh_s1>; +def Hexagon_M2_mmpyuh_rs0: + di_MInst_didi_rnd_sat <"vmpywouh", int_hexagon_M2_mmpyuh_rs0>; +def Hexagon_M2_mmpyuh_s0: + di_MInst_didi_sat <"vmpywouh", int_hexagon_M2_mmpyuh_s0>; +def Hexagon_M2_mmaculs_rs1: + di_MInst_dididi_acc_s1_rnd_sat <"vmpyweuh", int_hexagon_M2_mmaculs_rs1>; +def Hexagon_M2_mmaculs_s1: + di_MInst_dididi_acc_s1_sat <"vmpyweuh", int_hexagon_M2_mmaculs_s1>; +def Hexagon_M2_mmaculs_rs0: + di_MInst_dididi_acc_rnd_sat <"vmpyweuh", int_hexagon_M2_mmaculs_rs0>; +def Hexagon_M2_mmaculs_s0: + di_MInst_dididi_acc_sat <"vmpyweuh", int_hexagon_M2_mmaculs_s0>; +def Hexagon_M2_mmacuhs_rs1: + di_MInst_dididi_acc_s1_rnd_sat <"vmpywouh", int_hexagon_M2_mmacuhs_rs1>; +def Hexagon_M2_mmacuhs_s1: + di_MInst_dididi_acc_s1_sat <"vmpywouh", int_hexagon_M2_mmacuhs_s1>; +def Hexagon_M2_mmacuhs_rs0: + di_MInst_dididi_acc_rnd_sat <"vmpywouh", int_hexagon_M2_mmacuhs_rs0>; +def Hexagon_M2_mmacuhs_s0: + di_MInst_dididi_acc_sat <"vmpywouh", int_hexagon_M2_mmacuhs_s0>; + +// MTYPE / MPYH / Multiply and use upper result. +def Hexagon_M2_hmmpyh_rs1: + si_MInst_sisi_h_s1_rnd_sat <"mpy", int_hexagon_M2_hmmpyh_rs1>; +def Hexagon_M2_hmmpyl_rs1: + si_MInst_sisi_l_s1_rnd_sat <"mpy", int_hexagon_M2_hmmpyl_rs1>; +def Hexagon_M2_mpy_up: + si_MInst_sisi <"mpy", int_hexagon_M2_mpy_up>; +def Hexagon_M2_dpmpyss_rnd_s0: + si_MInst_sisi_rnd <"mpy", int_hexagon_M2_dpmpyss_rnd_s0>; +def Hexagon_M2_mpyu_up: + si_MInst_sisi <"mpyu", int_hexagon_M2_mpyu_up>; + +// MTYPE / MPYH / Multiply and use full result. +def Hexagon_M2_dpmpyuu_s0: + di_MInst_sisi <"mpyu", int_hexagon_M2_dpmpyuu_s0>; +def Hexagon_M2_dpmpyuu_acc_s0: + di_MInst_disisi_acc <"mpyu", int_hexagon_M2_dpmpyuu_acc_s0>; +def Hexagon_M2_dpmpyuu_nac_s0: + di_MInst_disisi_nac <"mpyu", int_hexagon_M2_dpmpyuu_nac_s0>; +def Hexagon_M2_dpmpyss_s0: + di_MInst_sisi <"mpy", int_hexagon_M2_dpmpyss_s0>; +def Hexagon_M2_dpmpyss_acc_s0: + di_MInst_disisi_acc <"mpy", int_hexagon_M2_dpmpyss_acc_s0>; +def Hexagon_M2_dpmpyss_nac_s0: + di_MInst_disisi_nac <"mpy", int_hexagon_M2_dpmpyss_nac_s0>; + + +/******************************************************************** +* MTYPE/MPYS * +*********************************************************************/ + +// MTYPE / MPYS / Scalar 16x16 multiply signed. 
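The scalar 16x16 defs that follow enumerate every combination of half-word selection (.H/.L on each source) with optional :<<1, :rnd and :sat; each combination is spelled out both in the def name and in the si_MInst_sisi_* class it instantiates. One fully expanded example:

// From the block below; si_MInst_sisi_sat_rnd_hh_s1 (declared above) gives
// "$dst = mpy($src1.H, $src2.H):<<1:rnd:sat" with a 32-bit IntRegs result,
// matching int_hexagon_M2_mpy_sat_rnd_hh_s1 on two IntRegs sources.
def Hexagon_M2_mpy_sat_rnd_hh_s1:
  si_MInst_sisi_sat_rnd_hh_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_hh_s1>;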
+//Rd=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]| +// [:<<0[:rnd|:sat|:rnd:sat]|:<<1[:rnd|:sat|:rnd:sat]]] +def Hexagon_M2_mpy_hh_s0: + si_MInst_sisi_hh <"mpy", int_hexagon_M2_mpy_hh_s0>; +def Hexagon_M2_mpy_hh_s1: + si_MInst_sisi_hh_s1 <"mpy", int_hexagon_M2_mpy_hh_s1>; +def Hexagon_M2_mpy_rnd_hh_s1: + si_MInst_sisi_rnd_hh_s1 <"mpy", int_hexagon_M2_mpy_rnd_hh_s1>; +def Hexagon_M2_mpy_sat_rnd_hh_s1: + si_MInst_sisi_sat_rnd_hh_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_hh_s1>; +def Hexagon_M2_mpy_sat_hh_s1: + si_MInst_sisi_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_sat_hh_s1>; +def Hexagon_M2_mpy_rnd_hh_s0: + si_MInst_sisi_rnd_hh <"mpy", int_hexagon_M2_mpy_rnd_hh_s0>; +def Hexagon_M2_mpy_sat_rnd_hh_s0: + si_MInst_sisi_sat_rnd_hh <"mpy", int_hexagon_M2_mpy_sat_rnd_hh_s0>; +def Hexagon_M2_mpy_sat_hh_s0: + si_MInst_sisi_sat_hh <"mpy", int_hexagon_M2_mpy_sat_hh_s0>; + +def Hexagon_M2_mpy_hl_s0: + si_MInst_sisi_hl <"mpy", int_hexagon_M2_mpy_hl_s0>; +def Hexagon_M2_mpy_hl_s1: + si_MInst_sisi_hl_s1 <"mpy", int_hexagon_M2_mpy_hl_s1>; +def Hexagon_M2_mpy_rnd_hl_s1: + si_MInst_sisi_rnd_hl_s1 <"mpy", int_hexagon_M2_mpy_rnd_hl_s1>; +def Hexagon_M2_mpy_sat_rnd_hl_s1: + si_MInst_sisi_sat_rnd_hl_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_hl_s1>; +def Hexagon_M2_mpy_sat_hl_s1: + si_MInst_sisi_sat_hl_s1 <"mpy", int_hexagon_M2_mpy_sat_hl_s1>; +def Hexagon_M2_mpy_rnd_hl_s0: + si_MInst_sisi_rnd_hl <"mpy", int_hexagon_M2_mpy_rnd_hl_s0>; +def Hexagon_M2_mpy_sat_rnd_hl_s0: + si_MInst_sisi_sat_rnd_hl <"mpy", int_hexagon_M2_mpy_sat_rnd_hl_s0>; +def Hexagon_M2_mpy_sat_hl_s0: + si_MInst_sisi_sat_hl <"mpy", int_hexagon_M2_mpy_sat_hl_s0>; + +def Hexagon_M2_mpy_lh_s0: + si_MInst_sisi_lh <"mpy", int_hexagon_M2_mpy_lh_s0>; +def Hexagon_M2_mpy_lh_s1: + si_MInst_sisi_lh_s1 <"mpy", int_hexagon_M2_mpy_lh_s1>; +def Hexagon_M2_mpy_rnd_lh_s1: + si_MInst_sisi_rnd_lh_s1 <"mpy", int_hexagon_M2_mpy_rnd_lh_s1>; +def Hexagon_M2_mpy_sat_rnd_lh_s1: + si_MInst_sisi_sat_rnd_lh_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_lh_s1>; +def Hexagon_M2_mpy_sat_lh_s1: + si_MInst_sisi_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_sat_lh_s1>; +def Hexagon_M2_mpy_rnd_lh_s0: + si_MInst_sisi_rnd_lh <"mpy", int_hexagon_M2_mpy_rnd_lh_s0>; +def Hexagon_M2_mpy_sat_rnd_lh_s0: + si_MInst_sisi_sat_rnd_lh <"mpy", int_hexagon_M2_mpy_sat_rnd_lh_s0>; +def Hexagon_M2_mpy_sat_lh_s0: + si_MInst_sisi_sat_lh <"mpy", int_hexagon_M2_mpy_sat_lh_s0>; + +def Hexagon_M2_mpy_ll_s0: + si_MInst_sisi_ll <"mpy", int_hexagon_M2_mpy_ll_s0>; +def Hexagon_M2_mpy_ll_s1: + si_MInst_sisi_ll_s1 <"mpy", int_hexagon_M2_mpy_ll_s1>; +def Hexagon_M2_mpy_rnd_ll_s1: + si_MInst_sisi_rnd_ll_s1 <"mpy", int_hexagon_M2_mpy_rnd_ll_s1>; +def Hexagon_M2_mpy_sat_rnd_ll_s1: + si_MInst_sisi_sat_rnd_ll_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_ll_s1>; +def Hexagon_M2_mpy_sat_ll_s1: + si_MInst_sisi_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_sat_ll_s1>; +def Hexagon_M2_mpy_rnd_ll_s0: + si_MInst_sisi_rnd_ll <"mpy", int_hexagon_M2_mpy_rnd_ll_s0>; +def Hexagon_M2_mpy_sat_rnd_ll_s0: + si_MInst_sisi_sat_rnd_ll <"mpy", int_hexagon_M2_mpy_sat_rnd_ll_s0>; +def Hexagon_M2_mpy_sat_ll_s0: + si_MInst_sisi_sat_ll <"mpy", int_hexagon_M2_mpy_sat_ll_s0>; + +//Rdd=mpy(Rs.[H|L],Rt.[H|L])[[:<<0|:<<1]|[:<<0:rnd|:<<1:rnd]] +def Hexagon_M2_mpyd_hh_s0: + di_MInst_sisi_hh <"mpy", int_hexagon_M2_mpyd_hh_s0>; +def Hexagon_M2_mpyd_hh_s1: + di_MInst_sisi_hh_s1 <"mpy", int_hexagon_M2_mpyd_hh_s1>; +def Hexagon_M2_mpyd_rnd_hh_s1: + di_MInst_sisi_rnd_hh_s1 <"mpy", int_hexagon_M2_mpyd_rnd_hh_s1>; +def Hexagon_M2_mpyd_rnd_hh_s0: + di_MInst_sisi_rnd_hh <"mpy", int_hexagon_M2_mpyd_rnd_hh_s0>; + 
+def Hexagon_M2_mpyd_hl_s0: + di_MInst_sisi_hl <"mpy", int_hexagon_M2_mpyd_hl_s0>; +def Hexagon_M2_mpyd_hl_s1: + di_MInst_sisi_hl_s1 <"mpy", int_hexagon_M2_mpyd_hl_s1>; +def Hexagon_M2_mpyd_rnd_hl_s1: + di_MInst_sisi_rnd_hl_s1 <"mpy", int_hexagon_M2_mpyd_rnd_hl_s1>; +def Hexagon_M2_mpyd_rnd_hl_s0: + di_MInst_sisi_rnd_hl <"mpy", int_hexagon_M2_mpyd_rnd_hl_s0>; + +def Hexagon_M2_mpyd_lh_s0: + di_MInst_sisi_lh <"mpy", int_hexagon_M2_mpyd_lh_s0>; +def Hexagon_M2_mpyd_lh_s1: + di_MInst_sisi_lh_s1 <"mpy", int_hexagon_M2_mpyd_lh_s1>; +def Hexagon_M2_mpyd_rnd_lh_s1: + di_MInst_sisi_rnd_lh_s1 <"mpy", int_hexagon_M2_mpyd_rnd_lh_s1>; +def Hexagon_M2_mpyd_rnd_lh_s0: + di_MInst_sisi_rnd_lh <"mpy", int_hexagon_M2_mpyd_rnd_lh_s0>; + +def Hexagon_M2_mpyd_ll_s0: + di_MInst_sisi_ll <"mpy", int_hexagon_M2_mpyd_ll_s0>; +def Hexagon_M2_mpyd_ll_s1: + di_MInst_sisi_ll_s1 <"mpy", int_hexagon_M2_mpyd_ll_s1>; +def Hexagon_M2_mpyd_rnd_ll_s1: + di_MInst_sisi_rnd_ll_s1 <"mpy", int_hexagon_M2_mpyd_rnd_ll_s1>; +def Hexagon_M2_mpyd_rnd_ll_s0: + di_MInst_sisi_rnd_ll <"mpy", int_hexagon_M2_mpyd_rnd_ll_s0>; + +//Rx+=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]] +def Hexagon_M2_mpy_acc_hh_s0: + si_MInst_sisisi_acc_hh <"mpy", int_hexagon_M2_mpy_acc_hh_s0>; +def Hexagon_M2_mpy_acc_hh_s1: + si_MInst_sisisi_acc_hh_s1 <"mpy", int_hexagon_M2_mpy_acc_hh_s1>; +def Hexagon_M2_mpy_acc_sat_hh_s1: + si_MInst_sisisi_acc_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_hh_s1>; +def Hexagon_M2_mpy_acc_sat_hh_s0: + si_MInst_sisisi_acc_sat_hh <"mpy", int_hexagon_M2_mpy_acc_sat_hh_s0>; + +def Hexagon_M2_mpy_acc_hl_s0: + si_MInst_sisisi_acc_hl <"mpy", int_hexagon_M2_mpy_acc_hl_s0>; +def Hexagon_M2_mpy_acc_hl_s1: + si_MInst_sisisi_acc_hl_s1 <"mpy", int_hexagon_M2_mpy_acc_hl_s1>; +def Hexagon_M2_mpy_acc_sat_hl_s1: + si_MInst_sisisi_acc_sat_hl_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_hl_s1>; +def Hexagon_M2_mpy_acc_sat_hl_s0: + si_MInst_sisisi_acc_sat_hl <"mpy", int_hexagon_M2_mpy_acc_sat_hl_s0>; + +def Hexagon_M2_mpy_acc_lh_s0: + si_MInst_sisisi_acc_lh <"mpy", int_hexagon_M2_mpy_acc_lh_s0>; +def Hexagon_M2_mpy_acc_lh_s1: + si_MInst_sisisi_acc_lh_s1 <"mpy", int_hexagon_M2_mpy_acc_lh_s1>; +def Hexagon_M2_mpy_acc_sat_lh_s1: + si_MInst_sisisi_acc_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_lh_s1>; +def Hexagon_M2_mpy_acc_sat_lh_s0: + si_MInst_sisisi_acc_sat_lh <"mpy", int_hexagon_M2_mpy_acc_sat_lh_s0>; + +def Hexagon_M2_mpy_acc_ll_s0: + si_MInst_sisisi_acc_ll <"mpy", int_hexagon_M2_mpy_acc_ll_s0>; +def Hexagon_M2_mpy_acc_ll_s1: + si_MInst_sisisi_acc_ll_s1 <"mpy", int_hexagon_M2_mpy_acc_ll_s1>; +def Hexagon_M2_mpy_acc_sat_ll_s1: + si_MInst_sisisi_acc_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_ll_s1>; +def Hexagon_M2_mpy_acc_sat_ll_s0: + si_MInst_sisisi_acc_sat_ll <"mpy", int_hexagon_M2_mpy_acc_sat_ll_s0>; + +//Rx-=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]] +def Hexagon_M2_mpy_nac_hh_s0: + si_MInst_sisisi_nac_hh <"mpy", int_hexagon_M2_mpy_nac_hh_s0>; +def Hexagon_M2_mpy_nac_hh_s1: + si_MInst_sisisi_nac_hh_s1 <"mpy", int_hexagon_M2_mpy_nac_hh_s1>; +def Hexagon_M2_mpy_nac_sat_hh_s1: + si_MInst_sisisi_nac_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_hh_s1>; +def Hexagon_M2_mpy_nac_sat_hh_s0: + si_MInst_sisisi_nac_sat_hh <"mpy", int_hexagon_M2_mpy_nac_sat_hh_s0>; + +def Hexagon_M2_mpy_nac_hl_s0: + si_MInst_sisisi_nac_hl <"mpy", int_hexagon_M2_mpy_nac_hl_s0>; +def Hexagon_M2_mpy_nac_hl_s1: + si_MInst_sisisi_nac_hl_s1 <"mpy", int_hexagon_M2_mpy_nac_hl_s1>; +def Hexagon_M2_mpy_nac_sat_hl_s1: + si_MInst_sisisi_nac_sat_hl_s1 <"mpy", 
int_hexagon_M2_mpy_nac_sat_hl_s1>; +def Hexagon_M2_mpy_nac_sat_hl_s0: + si_MInst_sisisi_nac_sat_hl <"mpy", int_hexagon_M2_mpy_nac_sat_hl_s0>; + +def Hexagon_M2_mpy_nac_lh_s0: + si_MInst_sisisi_nac_lh <"mpy", int_hexagon_M2_mpy_nac_lh_s0>; +def Hexagon_M2_mpy_nac_lh_s1: + si_MInst_sisisi_nac_lh_s1 <"mpy", int_hexagon_M2_mpy_nac_lh_s1>; +def Hexagon_M2_mpy_nac_sat_lh_s1: + si_MInst_sisisi_nac_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_lh_s1>; +def Hexagon_M2_mpy_nac_sat_lh_s0: + si_MInst_sisisi_nac_sat_lh <"mpy", int_hexagon_M2_mpy_nac_sat_lh_s0>; + +def Hexagon_M2_mpy_nac_ll_s0: + si_MInst_sisisi_nac_ll <"mpy", int_hexagon_M2_mpy_nac_ll_s0>; +def Hexagon_M2_mpy_nac_ll_s1: + si_MInst_sisisi_nac_ll_s1 <"mpy", int_hexagon_M2_mpy_nac_ll_s1>; +def Hexagon_M2_mpy_nac_sat_ll_s1: + si_MInst_sisisi_nac_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_ll_s1>; +def Hexagon_M2_mpy_nac_sat_ll_s0: + si_MInst_sisisi_nac_sat_ll <"mpy", int_hexagon_M2_mpy_nac_sat_ll_s0>; + +//Rx+=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1] +def Hexagon_M2_mpyd_acc_hh_s0: + di_MInst_disisi_acc_hh <"mpy", int_hexagon_M2_mpyd_acc_hh_s0>; +def Hexagon_M2_mpyd_acc_hh_s1: + di_MInst_disisi_acc_hh_s1 <"mpy", int_hexagon_M2_mpyd_acc_hh_s1>; + +def Hexagon_M2_mpyd_acc_hl_s0: + di_MInst_disisi_acc_hl <"mpy", int_hexagon_M2_mpyd_acc_hl_s0>; +def Hexagon_M2_mpyd_acc_hl_s1: + di_MInst_disisi_acc_hl_s1 <"mpy", int_hexagon_M2_mpyd_acc_hl_s1>; + +def Hexagon_M2_mpyd_acc_lh_s0: + di_MInst_disisi_acc_lh <"mpy", int_hexagon_M2_mpyd_acc_lh_s0>; +def Hexagon_M2_mpyd_acc_lh_s1: + di_MInst_disisi_acc_lh_s1 <"mpy", int_hexagon_M2_mpyd_acc_lh_s1>; + +def Hexagon_M2_mpyd_acc_ll_s0: + di_MInst_disisi_acc_ll <"mpy", int_hexagon_M2_mpyd_acc_ll_s0>; +def Hexagon_M2_mpyd_acc_ll_s1: + di_MInst_disisi_acc_ll_s1 <"mpy", int_hexagon_M2_mpyd_acc_ll_s1>; + +//Rx-=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1] +def Hexagon_M2_mpyd_nac_hh_s0: + di_MInst_disisi_nac_hh <"mpy", int_hexagon_M2_mpyd_nac_hh_s0>; +def Hexagon_M2_mpyd_nac_hh_s1: + di_MInst_disisi_nac_hh_s1 <"mpy", int_hexagon_M2_mpyd_nac_hh_s1>; + +def Hexagon_M2_mpyd_nac_hl_s0: + di_MInst_disisi_nac_hl <"mpy", int_hexagon_M2_mpyd_nac_hl_s0>; +def Hexagon_M2_mpyd_nac_hl_s1: + di_MInst_disisi_nac_hl_s1 <"mpy", int_hexagon_M2_mpyd_nac_hl_s1>; + +def Hexagon_M2_mpyd_nac_lh_s0: + di_MInst_disisi_nac_lh <"mpy", int_hexagon_M2_mpyd_nac_lh_s0>; +def Hexagon_M2_mpyd_nac_lh_s1: + di_MInst_disisi_nac_lh_s1 <"mpy", int_hexagon_M2_mpyd_nac_lh_s1>; + +def Hexagon_M2_mpyd_nac_ll_s0: + di_MInst_disisi_nac_ll <"mpy", int_hexagon_M2_mpyd_nac_ll_s0>; +def Hexagon_M2_mpyd_nac_ll_s1: + di_MInst_disisi_nac_ll_s1 <"mpy", int_hexagon_M2_mpyd_nac_ll_s1>; + +// MTYPE / MPYS / Scalar 16x16 multiply unsigned. 
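The unsigned mpyu forms that follow differ from the signed ones only in zero-extending the selected halfwords. For instance, an illustrative sketch of the ll variant:

#include <cstdint>

// Sketch of Rd = mpyu(Rs.l, Rt.l): zero-extended low halfwords, 32-bit product.
uint32_t mpyu_ll_s0(uint32_t rs, uint32_t rt) {
  uint32_t a = rs & 0xFFFFu;   // Rs.l, zero-extended
  uint32_t b = rt & 0xFFFFu;   // Rt.l, zero-extended
  return a * b;                // 0xFFFF * 0xFFFF still fits in 32 bits
}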
+//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] +def Hexagon_M2_mpyu_hh_s0: + si_MInst_sisi_hh <"mpyu", int_hexagon_M2_mpyu_hh_s0>; +def Hexagon_M2_mpyu_hh_s1: + si_MInst_sisi_hh_s1 <"mpyu", int_hexagon_M2_mpyu_hh_s1>; +def Hexagon_M2_mpyu_hl_s0: + si_MInst_sisi_hl <"mpyu", int_hexagon_M2_mpyu_hl_s0>; +def Hexagon_M2_mpyu_hl_s1: + si_MInst_sisi_hl_s1 <"mpyu", int_hexagon_M2_mpyu_hl_s1>; +def Hexagon_M2_mpyu_lh_s0: + si_MInst_sisi_lh <"mpyu", int_hexagon_M2_mpyu_lh_s0>; +def Hexagon_M2_mpyu_lh_s1: + si_MInst_sisi_lh_s1 <"mpyu", int_hexagon_M2_mpyu_lh_s1>; +def Hexagon_M2_mpyu_ll_s0: + si_MInst_sisi_ll <"mpyu", int_hexagon_M2_mpyu_ll_s0>; +def Hexagon_M2_mpyu_ll_s1: + si_MInst_sisi_ll_s1 <"mpyu", int_hexagon_M2_mpyu_ll_s1>; + +//Rdd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] +def Hexagon_M2_mpyud_hh_s0: + di_MInst_sisi_hh <"mpyu", int_hexagon_M2_mpyud_hh_s0>; +def Hexagon_M2_mpyud_hh_s1: + di_MInst_sisi_hh_s1 <"mpyu", int_hexagon_M2_mpyud_hh_s1>; +def Hexagon_M2_mpyud_hl_s0: + di_MInst_sisi_hl <"mpyu", int_hexagon_M2_mpyud_hl_s0>; +def Hexagon_M2_mpyud_hl_s1: + di_MInst_sisi_hl_s1 <"mpyu", int_hexagon_M2_mpyud_hl_s1>; +def Hexagon_M2_mpyud_lh_s0: + di_MInst_sisi_lh <"mpyu", int_hexagon_M2_mpyud_lh_s0>; +def Hexagon_M2_mpyud_lh_s1: + di_MInst_sisi_lh_s1 <"mpyu", int_hexagon_M2_mpyud_lh_s1>; +def Hexagon_M2_mpyud_ll_s0: + di_MInst_sisi_ll <"mpyu", int_hexagon_M2_mpyud_ll_s0>; +def Hexagon_M2_mpyud_ll_s1: + di_MInst_sisi_ll_s1 <"mpyu", int_hexagon_M2_mpyud_ll_s1>; + +//Rd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] +def Hexagon_M2_mpyu_acc_hh_s0: + si_MInst_sisisi_acc_hh <"mpyu", int_hexagon_M2_mpyu_acc_hh_s0>; +def Hexagon_M2_mpyu_acc_hh_s1: + si_MInst_sisisi_acc_hh_s1 <"mpyu", int_hexagon_M2_mpyu_acc_hh_s1>; +def Hexagon_M2_mpyu_acc_hl_s0: + si_MInst_sisisi_acc_hl <"mpyu", int_hexagon_M2_mpyu_acc_hl_s0>; +def Hexagon_M2_mpyu_acc_hl_s1: + si_MInst_sisisi_acc_hl_s1 <"mpyu", int_hexagon_M2_mpyu_acc_hl_s1>; +def Hexagon_M2_mpyu_acc_lh_s0: + si_MInst_sisisi_acc_lh <"mpyu", int_hexagon_M2_mpyu_acc_lh_s0>; +def Hexagon_M2_mpyu_acc_lh_s1: + si_MInst_sisisi_acc_lh_s1 <"mpyu", int_hexagon_M2_mpyu_acc_lh_s1>; +def Hexagon_M2_mpyu_acc_ll_s0: + si_MInst_sisisi_acc_ll <"mpyu", int_hexagon_M2_mpyu_acc_ll_s0>; +def Hexagon_M2_mpyu_acc_ll_s1: + si_MInst_sisisi_acc_ll_s1 <"mpyu", int_hexagon_M2_mpyu_acc_ll_s1>; + +//Rd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] +def Hexagon_M2_mpyu_nac_hh_s0: + si_MInst_sisisi_nac_hh <"mpyu", int_hexagon_M2_mpyu_nac_hh_s0>; +def Hexagon_M2_mpyu_nac_hh_s1: + si_MInst_sisisi_nac_hh_s1 <"mpyu", int_hexagon_M2_mpyu_nac_hh_s1>; +def Hexagon_M2_mpyu_nac_hl_s0: + si_MInst_sisisi_nac_hl <"mpyu", int_hexagon_M2_mpyu_nac_hl_s0>; +def Hexagon_M2_mpyu_nac_hl_s1: + si_MInst_sisisi_nac_hl_s1 <"mpyu", int_hexagon_M2_mpyu_nac_hl_s1>; +def Hexagon_M2_mpyu_nac_lh_s0: + si_MInst_sisisi_nac_lh <"mpyu", int_hexagon_M2_mpyu_nac_lh_s0>; +def Hexagon_M2_mpyu_nac_lh_s1: + si_MInst_sisisi_nac_lh_s1 <"mpyu", int_hexagon_M2_mpyu_nac_lh_s1>; +def Hexagon_M2_mpyu_nac_ll_s0: + si_MInst_sisisi_nac_ll <"mpyu", int_hexagon_M2_mpyu_nac_ll_s0>; +def Hexagon_M2_mpyu_nac_ll_s1: + si_MInst_sisisi_nac_ll_s1 <"mpyu", int_hexagon_M2_mpyu_nac_ll_s1>; + +//Rdd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] +def Hexagon_M2_mpyud_acc_hh_s0: + di_MInst_disisi_acc_hh <"mpyu", int_hexagon_M2_mpyud_acc_hh_s0>; +def Hexagon_M2_mpyud_acc_hh_s1: + di_MInst_disisi_acc_hh_s1 <"mpyu", int_hexagon_M2_mpyud_acc_hh_s1>; +def Hexagon_M2_mpyud_acc_hl_s0: + di_MInst_disisi_acc_hl <"mpyu", int_hexagon_M2_mpyud_acc_hl_s0>; +def Hexagon_M2_mpyud_acc_hl_s1: + 
di_MInst_disisi_acc_hl_s1 <"mpyu", int_hexagon_M2_mpyud_acc_hl_s1>; +def Hexagon_M2_mpyud_acc_lh_s0: + di_MInst_disisi_acc_lh <"mpyu", int_hexagon_M2_mpyud_acc_lh_s0>; +def Hexagon_M2_mpyud_acc_lh_s1: + di_MInst_disisi_acc_lh_s1 <"mpyu", int_hexagon_M2_mpyud_acc_lh_s1>; +def Hexagon_M2_mpyud_acc_ll_s0: + di_MInst_disisi_acc_ll <"mpyu", int_hexagon_M2_mpyud_acc_ll_s0>; +def Hexagon_M2_mpyud_acc_ll_s1: + di_MInst_disisi_acc_ll_s1 <"mpyu", int_hexagon_M2_mpyud_acc_ll_s1>; + +//Rdd-=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] +def Hexagon_M2_mpyud_nac_hh_s0: + di_MInst_disisi_nac_hh <"mpyu", int_hexagon_M2_mpyud_nac_hh_s0>; +def Hexagon_M2_mpyud_nac_hh_s1: + di_MInst_disisi_nac_hh_s1 <"mpyu", int_hexagon_M2_mpyud_nac_hh_s1>; +def Hexagon_M2_mpyud_nac_hl_s0: + di_MInst_disisi_nac_hl <"mpyu", int_hexagon_M2_mpyud_nac_hl_s0>; +def Hexagon_M2_mpyud_nac_hl_s1: + di_MInst_disisi_nac_hl_s1 <"mpyu", int_hexagon_M2_mpyud_nac_hl_s1>; +def Hexagon_M2_mpyud_nac_lh_s0: + di_MInst_disisi_nac_lh <"mpyu", int_hexagon_M2_mpyud_nac_lh_s0>; +def Hexagon_M2_mpyud_nac_lh_s1: + di_MInst_disisi_nac_lh_s1 <"mpyu", int_hexagon_M2_mpyud_nac_lh_s1>; +def Hexagon_M2_mpyud_nac_ll_s0: + di_MInst_disisi_nac_ll <"mpyu", int_hexagon_M2_mpyud_nac_ll_s0>; +def Hexagon_M2_mpyud_nac_ll_s1: + di_MInst_disisi_nac_ll_s1 <"mpyu", int_hexagon_M2_mpyud_nac_ll_s1>; + + +/******************************************************************** +* MTYPE/VB * +*********************************************************************/ + +// MTYPE / VB / Vector reduce add unsigned bytes. +def Hexagon_A2_vraddub: + di_MInst_didi <"vraddub", int_hexagon_A2_vraddub>; +def Hexagon_A2_vraddub_acc: + di_MInst_dididi_acc <"vraddub", int_hexagon_A2_vraddub_acc>; + +// MTYPE / VB / Vector sum of absolute differences unsigned bytes. +def Hexagon_A2_vrsadub: + di_MInst_didi <"vrsadub", int_hexagon_A2_vrsadub>; +def Hexagon_A2_vrsadub_acc: + di_MInst_dididi_acc <"vrsadub", int_hexagon_A2_vrsadub_acc>; + +/******************************************************************** +* MTYPE/VH * +*********************************************************************/ + +// MTYPE / VH / Vector dual multiply. +def Hexagon_M2_vdmpys_s1: + di_MInst_didi_s1_sat <"vdmpy", int_hexagon_M2_vdmpys_s1>; +def Hexagon_M2_vdmpys_s0: + di_MInst_didi_sat <"vdmpy", int_hexagon_M2_vdmpys_s0>; +def Hexagon_M2_vdmacs_s1: + di_MInst_dididi_acc_s1_sat <"vdmpy", int_hexagon_M2_vdmacs_s1>; +def Hexagon_M2_vdmacs_s0: + di_MInst_dididi_acc_sat <"vdmpy", int_hexagon_M2_vdmacs_s0>; + +// MTYPE / VH / Vector dual multiply with round and pack. +def Hexagon_M2_vdmpyrs_s0: + si_MInst_didi_rnd_sat <"vdmpy", int_hexagon_M2_vdmpyrs_s0>; +def Hexagon_M2_vdmpyrs_s1: + si_MInst_didi_s1_rnd_sat <"vdmpy", int_hexagon_M2_vdmpyrs_s1>; + +// MTYPE / VH / Vector multiply even halfwords. +def Hexagon_M2_vmpy2es_s1: + di_MInst_didi_s1_sat <"vmpyeh", int_hexagon_M2_vmpy2es_s1>; +def Hexagon_M2_vmpy2es_s0: + di_MInst_didi_sat <"vmpyeh", int_hexagon_M2_vmpy2es_s0>; +def Hexagon_M2_vmac2es: + di_MInst_dididi_acc <"vmpyeh", int_hexagon_M2_vmac2es>; +def Hexagon_M2_vmac2es_s1: + di_MInst_dididi_acc_s1_sat <"vmpyeh", int_hexagon_M2_vmac2es_s1>; +def Hexagon_M2_vmac2es_s0: + di_MInst_dididi_acc_sat <"vmpyeh", int_hexagon_M2_vmac2es_s0>; + +// MTYPE / VH / Vector multiply halfwords. 
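In the vector halfword multiplies that follow, each 32-bit source is treated as two halfword lanes and the two products fill the two words of a 64-bit pair. A hedged sketch, assuming the usual low-lane-to-low-word packing (the :sat marking is omitted here because an unshifted 16x16 product cannot exceed 32 bits):

#include <cstdint>

// Sketch of Rdd = vmpyh(Rs, Rt), s0 form.
uint64_t vmpyh_s0(uint32_t rs, uint32_t rt) {
  int32_t lo = (int32_t)(int16_t)(rs & 0xFFFF) * (int16_t)(rt & 0xFFFF);
  int32_t hi = (int32_t)(int16_t)(rs >> 16)    * (int16_t)(rt >> 16);
  return ((uint64_t)(uint32_t)hi << 32) | (uint32_t)lo;
}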
+def Hexagon_M2_vmpy2s_s0: + di_MInst_sisi_sat <"vmpyh", int_hexagon_M2_vmpy2s_s0>; +def Hexagon_M2_vmpy2s_s1: + di_MInst_sisi_s1_sat <"vmpyh", int_hexagon_M2_vmpy2s_s1>; +def Hexagon_M2_vmac2: + di_MInst_disisi_acc <"vmpyh", int_hexagon_M2_vmac2>; +def Hexagon_M2_vmac2s_s0: + di_MInst_disisi_acc_sat <"vmpyh", int_hexagon_M2_vmac2s_s0>; +def Hexagon_M2_vmac2s_s1: + di_MInst_disisi_acc_s1_sat <"vmpyh", int_hexagon_M2_vmac2s_s1>; + +// MTYPE / VH / Vector multiply halfwords with round and pack. +def Hexagon_M2_vmpy2s_s0pack: + si_MInst_sisi_rnd_sat <"vmpyh", int_hexagon_M2_vmpy2s_s0pack>; +def Hexagon_M2_vmpy2s_s1pack: + si_MInst_sisi_s1_rnd_sat <"vmpyh", int_hexagon_M2_vmpy2s_s1pack>; + +// MTYPE / VH / Vector reduce multiply halfwords. +// Rxx32+=vrmpyh(Rss32,Rtt32) +def Hexagon_M2_vrmpy_s0: + di_MInst_didi <"vrmpyh", int_hexagon_M2_vrmpy_s0>; +def Hexagon_M2_vrmac_s0: + di_MInst_dididi_acc <"vrmpyh", int_hexagon_M2_vrmac_s0>; + + +/******************************************************************** +* STYPE/ALU * +*********************************************************************/ + +// STYPE / ALU / Absolute value. +def Hexagon_A2_abs: + si_SInst_si <"abs", int_hexagon_A2_abs>; +def Hexagon_A2_absp: + di_SInst_di <"abs", int_hexagon_A2_absp>; +def Hexagon_A2_abssat: + si_SInst_si_sat <"abs", int_hexagon_A2_abssat>; + +// STYPE / ALU / Negate. +def Hexagon_A2_negp: + di_SInst_di <"neg", int_hexagon_A2_negp>; +def Hexagon_A2_negsat: + si_SInst_si_sat <"neg", int_hexagon_A2_negsat>; + +// STYPE / ALU / Logical Not. +def Hexagon_A2_notp: + di_SInst_di <"not", int_hexagon_A2_notp>; + +// STYPE / ALU / Sign extend word to doubleword. +def Hexagon_A2_sxtw: + di_SInst_si <"sxtw", int_hexagon_A2_sxtw>; + + +/******************************************************************** +* STYPE/BIT * +*********************************************************************/ + +// STYPE / BIT / Count leading. +def Hexagon_S2_cl0: + si_SInst_si <"cl0", int_hexagon_S2_cl0>; +def Hexagon_S2_cl0p: + si_SInst_di <"cl0", int_hexagon_S2_cl0p>; +def Hexagon_S2_cl1: + si_SInst_si <"cl1", int_hexagon_S2_cl1>; +def Hexagon_S2_cl1p: + si_SInst_di <"cl1", int_hexagon_S2_cl1p>; +def Hexagon_S2_clb: + si_SInst_si <"clb", int_hexagon_S2_clb>; +def Hexagon_S2_clbp: + si_SInst_di <"clb", int_hexagon_S2_clbp>; +def Hexagon_S2_clbnorm: + si_SInst_si <"normamt", int_hexagon_S2_clbnorm>; + +// STYPE / BIT / Count trailing. +def Hexagon_S2_ct0: + si_SInst_si <"ct0", int_hexagon_S2_ct0>; +def Hexagon_S2_ct1: + si_SInst_si <"ct1", int_hexagon_S2_ct1>; + +// STYPE / BIT / Compare bit mask. +def HEXAGON_C2_bitsclr: + qi_SInst_sisi <"bitsclr", int_hexagon_C2_bitsclr>; +def HEXAGON_C2_bitsclri: + qi_SInst_siu6 <"bitsclr", int_hexagon_C2_bitsclri>; +def HEXAGON_C2_bitsset: + qi_SInst_sisi <"bitsset", int_hexagon_C2_bitsset>; + +// STYPE / BIT / Extract unsigned. +// Rd[d][32/64]=extractu(Rs[s],Rt[t],[imm]) +def Hexagon_S2_extractu: + si_SInst_siu5u5 <"extractu",int_hexagon_S2_extractu>; +def Hexagon_S2_extractu_rp: + si_SInst_sidi <"extractu",int_hexagon_S2_extractu_rp>; +def Hexagon_S2_extractup: + di_SInst_diu6u6 <"extractu",int_hexagon_S2_extractup>; +def Hexagon_S2_extractup_rp: + di_SInst_didi <"extractu",int_hexagon_S2_extractup_rp>; + +// STYPE / BIT / Insert bitfield. 
+def HEXAGON_S2_insert: + si_SInst_sisiu5u5 <"insert", int_hexagon_S2_insert>; +def HEXAGON_S2_insert_rp: + si_SInst_sisidi <"insert", int_hexagon_S2_insert_rp>; +def HEXAGON_S2_insertp: + di_SInst_didiu6u6 <"insert", int_hexagon_S2_insertp>; +def HEXAGON_S2_insertp_rp: + di_SInst_dididi <"insert", int_hexagon_S2_insertp_rp>; + +// STYPE / BIT / Innterleave/deinterleave. +def HEXAGON_S2_interleave: + di_SInst_di <"interleave", int_hexagon_S2_interleave>; +def HEXAGON_S2_deinterleave: + di_SInst_di <"deinterleave", int_hexagon_S2_deinterleave>; + +// STYPE / BIT / Linear feedback-shift Iteration. +def HEXAGON_S2_lfsp: + di_SInst_didi <"lfs", int_hexagon_S2_lfsp>; + +// STYPE / BIT / Bit reverse. +def HEXAGON_S2_brev: + si_SInst_si <"brev", int_hexagon_S2_brev>; + +// STYPE / BIT / Set/Clear/Toggle Bit. +def Hexagon_S2_setbit_i: + si_SInst_siu5 <"setbit", int_hexagon_S2_setbit_i>; +def Hexagon_S2_togglebit_i: + si_SInst_siu5 <"togglebit", int_hexagon_S2_togglebit_i>; +def Hexagon_S2_clrbit_i: + si_SInst_siu5 <"clrbit", int_hexagon_S2_clrbit_i>; +def Hexagon_S2_setbit_r: + si_SInst_sisi <"setbit", int_hexagon_S2_setbit_r>; +def Hexagon_S2_togglebit_r: + si_SInst_sisi <"togglebit", int_hexagon_S2_togglebit_r>; +def Hexagon_S2_clrbit_r: + si_SInst_sisi <"clrbit", int_hexagon_S2_clrbit_r>; + +// STYPE / BIT / Test Bit. +def Hexagon_S2_tstbit_i: + qi_SInst_siu5 <"tstbit", int_hexagon_S2_tstbit_i>; +def Hexagon_S2_tstbit_r: + qi_SInst_sisi <"tstbit", int_hexagon_S2_tstbit_r>; + + +/******************************************************************** +* STYPE/COMPLEX * +*********************************************************************/ + +// STYPE / COMPLEX / Vector Complex conjugate. +def Hexagon_A2_vconj: + di_SInst_di_sat <"vconj", int_hexagon_A2_vconj>; + +// STYPE / COMPLEX / Vector Complex rotate. +def Hexagon_S2_vcrotate: + di_SInst_disi <"vcrotate",int_hexagon_S2_vcrotate>; + + +/******************************************************************** +* STYPE/PERM * +*********************************************************************/ + +// STYPE / PERM / Saturate. +def Hexagon_A2_sat: + si_SInst_di <"sat", int_hexagon_A2_sat>; +def Hexagon_A2_satb: + si_SInst_si <"satb", int_hexagon_A2_satb>; +def Hexagon_A2_sath: + si_SInst_si <"sath", int_hexagon_A2_sath>; +def Hexagon_A2_satub: + si_SInst_si <"satub", int_hexagon_A2_satub>; +def Hexagon_A2_satuh: + si_SInst_si <"satuh", int_hexagon_A2_satuh>; + +// STYPE / PERM / Swizzle bytes. +def Hexagon_A2_swiz: + si_SInst_si <"swiz", int_hexagon_A2_swiz>; + +// STYPE / PERM / Vector align. +// Need custom lowering +def Hexagon_S2_valignib: + di_SInst_didiu3 <"valignb", int_hexagon_S2_valignib>; +def Hexagon_S2_valignrb: + di_SInst_didiqi <"valignb", int_hexagon_S2_valignrb>; + +// STYPE / PERM / Vector round and pack. +def Hexagon_S2_vrndpackwh: + si_SInst_di <"vrndwh", int_hexagon_S2_vrndpackwh>; +def Hexagon_S2_vrndpackwhs: + si_SInst_di_sat <"vrndwh", int_hexagon_S2_vrndpackwhs>; + +// STYPE / PERM / Vector saturate and pack. +def Hexagon_S2_svsathb: + si_SInst_si <"vsathb", int_hexagon_S2_svsathb>; +def Hexagon_S2_vsathb: + si_SInst_di <"vsathb", int_hexagon_S2_vsathb>; +def Hexagon_S2_svsathub: + si_SInst_si <"vsathub", int_hexagon_S2_svsathub>; +def Hexagon_S2_vsathub: + si_SInst_di <"vsathub", int_hexagon_S2_vsathub>; +def Hexagon_S2_vsatwh: + si_SInst_di <"vsatwh", int_hexagon_S2_vsatwh>; +def Hexagon_S2_vsatwuh: + si_SInst_di <"vsatwuh", int_hexagon_S2_vsatwuh>; + +// STYPE / PERM / Vector saturate without pack. 
+def Hexagon_S2_vsathb_nopack: + di_SInst_di <"vsathb", int_hexagon_S2_vsathb_nopack>; +def Hexagon_S2_vsathub_nopack: + di_SInst_di <"vsathub", int_hexagon_S2_vsathub_nopack>; +def Hexagon_S2_vsatwh_nopack: + di_SInst_di <"vsatwh", int_hexagon_S2_vsatwh_nopack>; +def Hexagon_S2_vsatwuh_nopack: + di_SInst_di <"vsatwuh", int_hexagon_S2_vsatwuh_nopack>; + +// STYPE / PERM / Vector shuffle. +def Hexagon_S2_shuffeb: + di_SInst_didi <"shuffeb", int_hexagon_S2_shuffeb>; +def Hexagon_S2_shuffeh: + di_SInst_didi <"shuffeh", int_hexagon_S2_shuffeh>; +def Hexagon_S2_shuffob: + di_SInst_didi <"shuffob", int_hexagon_S2_shuffob>; +def Hexagon_S2_shuffoh: + di_SInst_didi <"shuffoh", int_hexagon_S2_shuffoh>; + +// STYPE / PERM / Vector splat bytes. +def Hexagon_S2_vsplatrb: + si_SInst_si <"vsplatb", int_hexagon_S2_vsplatrb>; + +// STYPE / PERM / Vector splat halfwords. +def Hexagon_S2_vsplatrh: + di_SInst_si <"vsplath", int_hexagon_S2_vsplatrh>; + +// STYPE / PERM / Vector splice. +def HEXAGON_S2_vsplicerb: + di_SInst_didiqi <"vspliceb",int_hexagon_S2_vsplicerb>; +def HEXAGON_S2_vspliceib: + di_SInst_didiu3 <"vspliceb",int_hexagon_S2_vspliceib>; + +// STYPE / PERM / Sign extend. +def Hexagon_S2_vsxtbh: + di_SInst_si <"vsxtbh", int_hexagon_S2_vsxtbh>; +def Hexagon_S2_vsxthw: + di_SInst_si <"vsxthw", int_hexagon_S2_vsxthw>; + +// STYPE / PERM / Truncate. +def Hexagon_S2_vtrunehb: + si_SInst_di <"vtrunehb",int_hexagon_S2_vtrunehb>; +def Hexagon_S2_vtrunohb: + si_SInst_di <"vtrunohb",int_hexagon_S2_vtrunohb>; +def Hexagon_S2_vtrunewh: + di_SInst_didi <"vtrunewh",int_hexagon_S2_vtrunewh>; +def Hexagon_S2_vtrunowh: + di_SInst_didi <"vtrunowh",int_hexagon_S2_vtrunowh>; + +// STYPE / PERM / Zero extend. +def Hexagon_S2_vzxtbh: + di_SInst_si <"vzxtbh", int_hexagon_S2_vzxtbh>; +def Hexagon_S2_vzxthw: + di_SInst_si <"vzxthw", int_hexagon_S2_vzxthw>; + + +/******************************************************************** +* STYPE/PRED * +*********************************************************************/ + +// STYPE / PRED / Mask generate from predicate. +def Hexagon_C2_mask: + di_SInst_qi <"mask", int_hexagon_C2_mask>; + +// STYPE / PRED / Predicate transfer. +def Hexagon_C2_tfrpr: + si_SInst_qi <"", int_hexagon_C2_tfrpr>; +def Hexagon_C2_tfrrp: + qi_SInst_si <"", int_hexagon_C2_tfrrp>; + +// STYPE / PRED / Viterbi pack even and odd predicate bits. +def Hexagon_C2_vitpack: + si_SInst_qiqi <"vitpack",int_hexagon_C2_vitpack>; + + +/******************************************************************** +* STYPE/SHIFT * +*********************************************************************/ + +// STYPE / SHIFT / Shift by immediate. +def Hexagon_S2_asl_i_r: + si_SInst_siu5 <"asl", int_hexagon_S2_asl_i_r>; +def Hexagon_S2_asr_i_r: + si_SInst_siu5 <"asr", int_hexagon_S2_asr_i_r>; +def Hexagon_S2_lsr_i_r: + si_SInst_siu5 <"lsr", int_hexagon_S2_lsr_i_r>; +def Hexagon_S2_asl_i_p: + di_SInst_diu6 <"asl", int_hexagon_S2_asl_i_p>; +def Hexagon_S2_asr_i_p: + di_SInst_diu6 <"asr", int_hexagon_S2_asr_i_p>; +def Hexagon_S2_lsr_i_p: + di_SInst_diu6 <"lsr", int_hexagon_S2_lsr_i_p>; + +// STYPE / SHIFT / Shift by immediate and accumulate. 
+def Hexagon_S2_asl_i_r_acc: + si_SInst_sisiu5_acc <"asl", int_hexagon_S2_asl_i_r_acc>; +def Hexagon_S2_asr_i_r_acc: + si_SInst_sisiu5_acc <"asr", int_hexagon_S2_asr_i_r_acc>; +def Hexagon_S2_lsr_i_r_acc: + si_SInst_sisiu5_acc <"lsr", int_hexagon_S2_lsr_i_r_acc>; +def Hexagon_S2_asl_i_r_nac: + si_SInst_sisiu5_nac <"asl", int_hexagon_S2_asl_i_r_nac>; +def Hexagon_S2_asr_i_r_nac: + si_SInst_sisiu5_nac <"asr", int_hexagon_S2_asr_i_r_nac>; +def Hexagon_S2_lsr_i_r_nac: + si_SInst_sisiu5_nac <"lsr", int_hexagon_S2_lsr_i_r_nac>; +def Hexagon_S2_asl_i_p_acc: + di_SInst_didiu6_acc <"asl", int_hexagon_S2_asl_i_p_acc>; +def Hexagon_S2_asr_i_p_acc: + di_SInst_didiu6_acc <"asr", int_hexagon_S2_asr_i_p_acc>; +def Hexagon_S2_lsr_i_p_acc: + di_SInst_didiu6_acc <"lsr", int_hexagon_S2_lsr_i_p_acc>; +def Hexagon_S2_asl_i_p_nac: + di_SInst_didiu6_nac <"asl", int_hexagon_S2_asl_i_p_nac>; +def Hexagon_S2_asr_i_p_nac: + di_SInst_didiu6_nac <"asr", int_hexagon_S2_asr_i_p_nac>; +def Hexagon_S2_lsr_i_p_nac: + di_SInst_didiu6_nac <"lsr", int_hexagon_S2_lsr_i_p_nac>; + +// STYPE / SHIFT / Shift by immediate and add. +def Hexagon_S2_addasl_rrri: + si_SInst_sisiu3 <"addasl", int_hexagon_S2_addasl_rrri>; + +// STYPE / SHIFT / Shift by immediate and logical. +def Hexagon_S2_asl_i_r_and: + si_SInst_sisiu5_and <"asl", int_hexagon_S2_asl_i_r_and>; +def Hexagon_S2_asr_i_r_and: + si_SInst_sisiu5_and <"asr", int_hexagon_S2_asr_i_r_and>; +def Hexagon_S2_lsr_i_r_and: + si_SInst_sisiu5_and <"lsr", int_hexagon_S2_lsr_i_r_and>; + +def Hexagon_S2_asl_i_r_xacc: + si_SInst_sisiu5_xor <"asl", int_hexagon_S2_asl_i_r_xacc>; +def Hexagon_S2_lsr_i_r_xacc: + si_SInst_sisiu5_xor <"lsr", int_hexagon_S2_lsr_i_r_xacc>; + +def Hexagon_S2_asl_i_r_or: + si_SInst_sisiu5_or <"asl", int_hexagon_S2_asl_i_r_or>; +def Hexagon_S2_asr_i_r_or: + si_SInst_sisiu5_or <"asr", int_hexagon_S2_asr_i_r_or>; +def Hexagon_S2_lsr_i_r_or: + si_SInst_sisiu5_or <"lsr", int_hexagon_S2_lsr_i_r_or>; + +def Hexagon_S2_asl_i_p_and: + di_SInst_didiu6_and <"asl", int_hexagon_S2_asl_i_p_and>; +def Hexagon_S2_asr_i_p_and: + di_SInst_didiu6_and <"asr", int_hexagon_S2_asr_i_p_and>; +def Hexagon_S2_lsr_i_p_and: + di_SInst_didiu6_and <"lsr", int_hexagon_S2_lsr_i_p_and>; + +def Hexagon_S2_asl_i_p_xacc: + di_SInst_didiu6_xor <"asl", int_hexagon_S2_asl_i_p_xacc>; +def Hexagon_S2_lsr_i_p_xacc: + di_SInst_didiu6_xor <"lsr", int_hexagon_S2_lsr_i_p_xacc>; + +def Hexagon_S2_asl_i_p_or: + di_SInst_didiu6_or <"asl", int_hexagon_S2_asl_i_p_or>; +def Hexagon_S2_asr_i_p_or: + di_SInst_didiu6_or <"asr", int_hexagon_S2_asr_i_p_or>; +def Hexagon_S2_lsr_i_p_or: + di_SInst_didiu6_or <"lsr", int_hexagon_S2_lsr_i_p_or>; + +// STYPE / SHIFT / Shift right by immediate with rounding. +def Hexagon_S2_asr_i_r_rnd: + si_SInst_siu5_rnd <"asr", int_hexagon_S2_asr_i_r_rnd>; +def Hexagon_S2_asr_i_r_rnd_goodsyntax: + si_SInst_siu5 <"asrrnd", int_hexagon_S2_asr_i_r_rnd_goodsyntax>; + +// STYPE / SHIFT / Shift left by immediate with saturation. +def Hexagon_S2_asl_i_r_sat: + si_SInst_sisi_sat <"asl", int_hexagon_S2_asl_i_r_sat>; + +// STYPE / SHIFT / Shift by register. 
+def Hexagon_S2_asl_r_r: + si_SInst_sisi <"asl", int_hexagon_S2_asl_r_r>; +def Hexagon_S2_asr_r_r: + si_SInst_sisi <"asr", int_hexagon_S2_asr_r_r>; +def Hexagon_S2_lsl_r_r: + si_SInst_sisi <"lsl", int_hexagon_S2_lsl_r_r>; +def Hexagon_S2_lsr_r_r: + si_SInst_sisi <"lsr", int_hexagon_S2_lsr_r_r>; +def Hexagon_S2_asl_r_p: + di_SInst_disi <"asl", int_hexagon_S2_asl_r_p>; +def Hexagon_S2_asr_r_p: + di_SInst_disi <"asr", int_hexagon_S2_asr_r_p>; +def Hexagon_S2_lsl_r_p: + di_SInst_disi <"lsl", int_hexagon_S2_lsl_r_p>; +def Hexagon_S2_lsr_r_p: + di_SInst_disi <"lsr", int_hexagon_S2_lsr_r_p>; + +// STYPE / SHIFT / Shift by register and accumulate. +def Hexagon_S2_asl_r_r_acc: + si_SInst_sisisi_acc <"asl", int_hexagon_S2_asl_r_r_acc>; +def Hexagon_S2_asr_r_r_acc: + si_SInst_sisisi_acc <"asr", int_hexagon_S2_asr_r_r_acc>; +def Hexagon_S2_lsl_r_r_acc: + si_SInst_sisisi_acc <"lsl", int_hexagon_S2_lsl_r_r_acc>; +def Hexagon_S2_lsr_r_r_acc: + si_SInst_sisisi_acc <"lsr", int_hexagon_S2_lsr_r_r_acc>; +def Hexagon_S2_asl_r_p_acc: + di_SInst_didisi_acc <"asl", int_hexagon_S2_asl_r_p_acc>; +def Hexagon_S2_asr_r_p_acc: + di_SInst_didisi_acc <"asr", int_hexagon_S2_asr_r_p_acc>; +def Hexagon_S2_lsl_r_p_acc: + di_SInst_didisi_acc <"lsl", int_hexagon_S2_lsl_r_p_acc>; +def Hexagon_S2_lsr_r_p_acc: + di_SInst_didisi_acc <"lsr", int_hexagon_S2_lsr_r_p_acc>; + +def Hexagon_S2_asl_r_r_nac: + si_SInst_sisisi_nac <"asl", int_hexagon_S2_asl_r_r_nac>; +def Hexagon_S2_asr_r_r_nac: + si_SInst_sisisi_nac <"asr", int_hexagon_S2_asr_r_r_nac>; +def Hexagon_S2_lsl_r_r_nac: + si_SInst_sisisi_nac <"lsl", int_hexagon_S2_lsl_r_r_nac>; +def Hexagon_S2_lsr_r_r_nac: + si_SInst_sisisi_nac <"lsr", int_hexagon_S2_lsr_r_r_nac>; +def Hexagon_S2_asl_r_p_nac: + di_SInst_didisi_nac <"asl", int_hexagon_S2_asl_r_p_nac>; +def Hexagon_S2_asr_r_p_nac: + di_SInst_didisi_nac <"asr", int_hexagon_S2_asr_r_p_nac>; +def Hexagon_S2_lsl_r_p_nac: + di_SInst_didisi_nac <"lsl", int_hexagon_S2_lsl_r_p_nac>; +def Hexagon_S2_lsr_r_p_nac: + di_SInst_didisi_nac <"lsr", int_hexagon_S2_lsr_r_p_nac>; + +// STYPE / SHIFT / Shift by register and logical. 
+def Hexagon_S2_asl_r_r_and: + si_SInst_sisisi_and <"asl", int_hexagon_S2_asl_r_r_and>; +def Hexagon_S2_asr_r_r_and: + si_SInst_sisisi_and <"asr", int_hexagon_S2_asr_r_r_and>; +def Hexagon_S2_lsl_r_r_and: + si_SInst_sisisi_and <"lsl", int_hexagon_S2_lsl_r_r_and>; +def Hexagon_S2_lsr_r_r_and: + si_SInst_sisisi_and <"lsr", int_hexagon_S2_lsr_r_r_and>; + +def Hexagon_S2_asl_r_r_or: + si_SInst_sisisi_or <"asl", int_hexagon_S2_asl_r_r_or>; +def Hexagon_S2_asr_r_r_or: + si_SInst_sisisi_or <"asr", int_hexagon_S2_asr_r_r_or>; +def Hexagon_S2_lsl_r_r_or: + si_SInst_sisisi_or <"lsl", int_hexagon_S2_lsl_r_r_or>; +def Hexagon_S2_lsr_r_r_or: + si_SInst_sisisi_or <"lsr", int_hexagon_S2_lsr_r_r_or>; + +def Hexagon_S2_asl_r_p_and: + di_SInst_didisi_and <"asl", int_hexagon_S2_asl_r_p_and>; +def Hexagon_S2_asr_r_p_and: + di_SInst_didisi_and <"asr", int_hexagon_S2_asr_r_p_and>; +def Hexagon_S2_lsl_r_p_and: + di_SInst_didisi_and <"lsl", int_hexagon_S2_lsl_r_p_and>; +def Hexagon_S2_lsr_r_p_and: + di_SInst_didisi_and <"lsr", int_hexagon_S2_lsr_r_p_and>; + +def Hexagon_S2_asl_r_p_or: + di_SInst_didisi_or <"asl", int_hexagon_S2_asl_r_p_or>; +def Hexagon_S2_asr_r_p_or: + di_SInst_didisi_or <"asr", int_hexagon_S2_asr_r_p_or>; +def Hexagon_S2_lsl_r_p_or: + di_SInst_didisi_or <"lsl", int_hexagon_S2_lsl_r_p_or>; +def Hexagon_S2_lsr_r_p_or: + di_SInst_didisi_or <"lsr", int_hexagon_S2_lsr_r_p_or>; + +// STYPE / SHIFT / Shift by register with saturation. +def Hexagon_S2_asl_r_r_sat: + si_SInst_sisi_sat <"asl", int_hexagon_S2_asl_r_r_sat>; +def Hexagon_S2_asr_r_r_sat: + si_SInst_sisi_sat <"asr", int_hexagon_S2_asr_r_r_sat>; + +// STYPE / SHIFT / Table Index. +def HEXAGON_S2_tableidxb_goodsyntax: + si_MInst_sisiu4u5 <"tableidxb",int_hexagon_S2_tableidxb_goodsyntax>; +def HEXAGON_S2_tableidxd_goodsyntax: + si_MInst_sisiu4u5 <"tableidxd",int_hexagon_S2_tableidxd_goodsyntax>; +def HEXAGON_S2_tableidxh_goodsyntax: + si_MInst_sisiu4u5 <"tableidxh",int_hexagon_S2_tableidxh_goodsyntax>; +def HEXAGON_S2_tableidxw_goodsyntax: + si_MInst_sisiu4u5 <"tableidxw",int_hexagon_S2_tableidxw_goodsyntax>; + + +/******************************************************************** +* STYPE/VH * +*********************************************************************/ + +// STYPE / VH / Vector absolute value halfwords. +// Rdd64=vabsh(Rss64) +def Hexagon_A2_vabsh: + di_SInst_di <"vabsh", int_hexagon_A2_vabsh>; +def Hexagon_A2_vabshsat: + di_SInst_di_sat <"vabsh", int_hexagon_A2_vabshsat>; + +// STYPE / VH / Vector shift halfwords by immediate. +// Rdd64=v[asl/asr/lsr]h(Rss64,Rt32) +def Hexagon_S2_asl_i_vh: + di_SInst_disi <"vaslh", int_hexagon_S2_asl_i_vh>; +def Hexagon_S2_asr_i_vh: + di_SInst_disi <"vasrh", int_hexagon_S2_asr_i_vh>; +def Hexagon_S2_lsr_i_vh: + di_SInst_disi <"vlsrh", int_hexagon_S2_lsr_i_vh>; + +// STYPE / VH / Vector shift halfwords by register. +// Rdd64=v[asl/asr/lsl/lsr]w(Rss64,Rt32) +def Hexagon_S2_asl_r_vh: + di_SInst_disi <"vaslh", int_hexagon_S2_asl_r_vh>; +def Hexagon_S2_asr_r_vh: + di_SInst_disi <"vasrh", int_hexagon_S2_asr_r_vh>; +def Hexagon_S2_lsl_r_vh: + di_SInst_disi <"vlslh", int_hexagon_S2_lsl_r_vh>; +def Hexagon_S2_lsr_r_vh: + di_SInst_disi <"vlsrh", int_hexagon_S2_lsr_r_vh>; + + +/******************************************************************** +* STYPE/VW * +*********************************************************************/ + +// STYPE / VW / Vector absolute value words. 
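vabsw, defined next, takes the absolute value of each 32-bit word of a register pair, and the :sat form clamps the INT32_MIN corner case instead of wrapping. A small sketch:

#include <cstdint>

// Sketch of Rdd = vabsw(Rss):sat on the two 32-bit lanes.
uint64_t vabsw_sat(uint64_t rss) {
  uint64_t out = 0;
  for (int lane = 0; lane < 2; ++lane) {
    int32_t w = (int32_t)(rss >> (lane * 32));
    int64_t a = (w < 0) ? -(int64_t)w : (int64_t)w;  // widen to avoid INT32_MIN overflow
    if (a > INT32_MAX) a = INT32_MAX;                // :sat clamps 0x80000000 to 0x7fffffff
    out |= (uint64_t)(uint32_t)(int32_t)a << (lane * 32);
  }
  return out;
}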
+def Hexagon_A2_vabsw: + di_SInst_di <"vabsw", int_hexagon_A2_vabsw>; +def Hexagon_A2_vabswsat: + di_SInst_di_sat <"vabsw", int_hexagon_A2_vabswsat>; + +// STYPE / VW / Vector shift words by immediate. +// Rdd64=v[asl/vsl]w(Rss64,Rt32) +def Hexagon_S2_asl_i_vw: + di_SInst_disi <"vaslw", int_hexagon_S2_asl_i_vw>; +def Hexagon_S2_asr_i_vw: + di_SInst_disi <"vasrw", int_hexagon_S2_asr_i_vw>; +def Hexagon_S2_lsr_i_vw: + di_SInst_disi <"vlsrw", int_hexagon_S2_lsr_i_vw>; + +// STYPE / VW / Vector shift words by register. +// Rdd64=v[asl/vsl]w(Rss64,Rt32) +def Hexagon_S2_asl_r_vw: + di_SInst_disi <"vaslw", int_hexagon_S2_asl_r_vw>; +def Hexagon_S2_asr_r_vw: + di_SInst_disi <"vasrw", int_hexagon_S2_asr_r_vw>; +def Hexagon_S2_lsl_r_vw: + di_SInst_disi <"vlslw", int_hexagon_S2_lsl_r_vw>; +def Hexagon_S2_lsr_r_vw: + di_SInst_disi <"vlsrw", int_hexagon_S2_lsr_r_vw>; + +// STYPE / VW / Vector shift words with truncate and pack. +def Hexagon_S2_asr_r_svw_trun: + si_SInst_disi <"vasrw", int_hexagon_S2_asr_r_svw_trun>; +def Hexagon_S2_asr_i_svw_trun: + si_SInst_diu5 <"vasrw", int_hexagon_S2_asr_i_svw_trun>; + +include "HexagonIntrinsicsV3.td" +include "HexagonIntrinsicsV4.td" diff --git a/lib/Target/Hexagon/HexagonIntrinsicsDerived.td b/lib/Target/Hexagon/HexagonIntrinsicsDerived.td new file mode 100644 index 0000000..68eaf68 --- /dev/null +++ b/lib/Target/Hexagon/HexagonIntrinsicsDerived.td @@ -0,0 +1,29 @@ +//===-- HexagonIntrinsicsDerived.td - Derived intrinsics ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Multiply 64-bit and use lower result +// +// Optimized with intrinisics accumulates +// +def : Pat <(mul DoubleRegs:$src1, DoubleRegs:$src2), + (COMBINE_rr + (Hexagon_M2_maci + (Hexagon_M2_maci (EXTRACT_SUBREG (MPYU64 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), + (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg)), + subreg_hireg), + (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), + (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)), + (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg), + (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), + (EXTRACT_SUBREG (MPYU64 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), + (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg)), + subreg_loreg))>; + + + diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV3.td b/lib/Target/Hexagon/HexagonIntrinsicsV3.td new file mode 100644 index 0000000..2a54e62 --- /dev/null +++ b/lib/Target/Hexagon/HexagonIntrinsicsV3.td @@ -0,0 +1,50 @@ +//=- HexagonIntrinsicsV3.td - Target Description for Hexagon -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon V3 Compiler Intrinsics in TableGen format. +// +//===----------------------------------------------------------------------===// + + + + +// MTYPE / COMPLEX / Vector reduce complex multiply real or imaginary. 
+def Hexagon_M2_vrcmpys_s1: + di_MInst_disi_s1_sat <"vrcmpys", int_hexagon_M2_vrcmpys_s1>; +def Hexagon_M2_vrcmpys_acc_s1: + di_MInst_didisi_acc_s1_sat <"vrcmpys", int_hexagon_M2_vrcmpys_acc_s1>; +def Hexagon_M2_vrcmpys_s1rp: + si_MInst_disi_s1_rnd_sat <"vrcmpys", int_hexagon_M2_vrcmpys_s1rp>; + + + + +/******************************************************************** +* MTYPE/VB * +*********************************************************************/ + +// MTYPE / VB / Vector reduce add unsigned bytes. +def Hexagon_M2_vradduh: + si_MInst_didi <"vradduh", int_hexagon_M2_vradduh>; + + +/******************************************************************** +* ALU64/ALU * +*********************************************************************/ + +// ALU64 / ALU / Add. +def Hexagon_A2_addsp: + di_ALU64_sidi <"add", int_hexagon_A2_addsp>; +def Hexagon_A2_addpsat: + di_ALU64_didi <"add", int_hexagon_A2_addpsat>; + +def Hexagon_A2_maxp: + di_ALU64_didi <"max", int_hexagon_A2_maxp>; +def Hexagon_A2_maxup: + di_ALU64_didi <"maxu", int_hexagon_A2_maxup>; diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV4.td b/lib/Target/Hexagon/HexagonIntrinsicsV4.td new file mode 100644 index 0000000..dd28ebb --- /dev/null +++ b/lib/Target/Hexagon/HexagonIntrinsicsV4.td @@ -0,0 +1,369 @@ +//===- HexagonIntrinsicsV4.td - V4 Instruction intrinsics --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This is populated based on the following specs: +// Hexagon V4 Architecture Extensions +// Application-Level Specification +// 80-V9418-12 Rev. A +// June 15, 2010 + + +// +// ALU 32 types. 
+// + +class si_ALU32_sisi_not<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, ~$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class di_ALU32_s8si<string opc, Intrinsic IntID> + : ALU32_rr<(outs DoubleRegs:$dst), (ins s8Imm:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "(#$src1, $src2)")), + [(set DoubleRegs:$dst, (IntID imm:$src1, IntRegs:$src2))]>; + +class di_ALU32_sis8<string opc, Intrinsic IntID> + : ALU32_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class qi_neg_ALU32_sisi<string opc, Intrinsic IntID> + : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class qi_neg_ALU32_sis10<string opc, Intrinsic IntID> + : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2), + !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class qi_neg_ALU32_siu9<string opc, Intrinsic IntID> + : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u9Imm:$src2), + !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class si_neg_ALU32_sisi<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class si_neg_ALU32_sis8<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2), + !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class si_ALU32_sis8<string opc, Intrinsic IntID> + : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + + +// +// SInst Classes. 
+// +class qi_neg_SInst_qiqi<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class qi_SInst_qi_andqiqi_neg<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst = ", !strconcat(opc , + "($src1, and($src2, !$src3)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2, + IntRegs:$src3))]>; + +class qi_SInst_qi_andqiqi<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst = ", !strconcat(opc , + "($src1, and($src2, $src3)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2, + IntRegs:$src3))]>; + +class qi_SInst_qi_orqiqi_neg<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst = ", !strconcat(opc , + "($src1, or($src2, !$src3)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2, + IntRegs:$src3))]>; + +class qi_SInst_qi_orqiqi<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst = ", !strconcat(opc , + "($src1, or($src2, $src3)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2, + IntRegs:$src3))]>; + +class si_SInst_si_addsis6<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s6Imm:$src3), + !strconcat("$dst = ", !strconcat(opc , + "($src1, add($src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2, + imm:$src3))]>; + +class si_SInst_si_subs6si<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3), + !strconcat("$dst = ", !strconcat(opc , + "($src1, sub(#$src2, $src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2, + IntRegs:$src3))]>; + +class di_ALU64_didi_neg<string opc, Intrinsic IntID> + : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, ~$src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class di_MInst_dididi_xacc<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + DoubleRegs:$src2), + !strconcat("$dst ^= ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1, + DoubleRegs:$src2))], + "$dst2 = $dst">; + +class si_MInst_sisisi_and<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst &= ", !strconcat(opc , "($src2, $src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3))]>; + +class si_MInst_sisisi_andn<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst &= ", !strconcat(opc , "($src2, ~$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3))]>; + +class si_SInst_sisis10_andi<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s10Imm:$src3), + !strconcat("$dst = ", !strconcat(opc , + "($src1, and($src2, #$src3))")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2, + imm:$src3))]>; + +class si_MInst_sisisi_xor<string opc, Intrinsic IntID> + : 
MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst ^= ", !strconcat(opc , "($src2, $src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3))]>; + +class si_MInst_sisisi_xorn<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst ^= ", !strconcat(opc , "($src2, ~$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3))]>; + +class si_SInst_sisis10_or<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2, s10Imm:$src3), + !strconcat("$dst |= ", !strconcat(opc , "($src2, #$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2, + imm:$src3))]>; + +class si_MInst_sisisi_or<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst |= ", !strconcat(opc , "($src2, $src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3))]>; + +class si_MInst_sisisi_orn<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3), + !strconcat("$dst |= ", !strconcat(opc , "($src2, ~$src3)")), + [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2, + IntRegs:$src3))]>; + +class si_SInst_siu5_sat<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):sat")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + + +/******************************************************************** +* ALU32/ALU * +*********************************************************************/ + +// ALU32 / ALU / Logical Operations. +def Hexagon_A4_orn : si_ALU32_sisi_not <"or", int_hexagon_A4_orn>; +def Hexagon_A4_andn : si_ALU32_sisi_not <"and", int_hexagon_A4_andn>; + + +/******************************************************************** +* ALU32/PERM * +*********************************************************************/ + +// ALU32 / PERM / Combine Words Into Doublewords. +def Hexagon_A4_combineir : di_ALU32_s8si <"combine", int_hexagon_A4_combineir>; +def Hexagon_A4_combineri : di_ALU32_sis8 <"combine", int_hexagon_A4_combineri>; + + +/******************************************************************** +* ALU32/PRED * +*********************************************************************/ + +// ALU32 / PRED / Conditional Shift Halfword. +// ALU32 / PRED / Conditional Sign Extend. +// ALU32 / PRED / Conditional Zero Extend. +// ALU32 / PRED / Compare. +def Hexagon_C4_cmpneq : qi_neg_ALU32_sisi <"cmp.eq", int_hexagon_C4_cmpneq>; +def Hexagon_C4_cmpneqi : qi_neg_ALU32_sis10 <"cmp.eq", int_hexagon_C4_cmpneqi>; +def Hexagon_C4_cmplte : qi_neg_ALU32_sisi <"cmp.gt", int_hexagon_C4_cmplte>; +def Hexagon_C4_cmpltei : qi_neg_ALU32_sis10 <"cmp.gt", int_hexagon_C4_cmpltei>; +def Hexagon_C4_cmplteu : qi_neg_ALU32_sisi <"cmp.gtu",int_hexagon_C4_cmplteu>; +def Hexagon_C4_cmplteui: qi_neg_ALU32_siu9 <"cmp.gtu",int_hexagon_C4_cmplteui>; + +// ALU32 / PRED / cmpare To General Register. 
+def Hexagon_A4_rcmpneq : si_neg_ALU32_sisi <"cmp.eq", int_hexagon_A4_rcmpneq>; +def Hexagon_A4_rcmpneqi: si_neg_ALU32_sis8 <"cmp.eq", int_hexagon_A4_rcmpneqi>; +def Hexagon_A4_rcmpeq : si_ALU32_sisi <"cmp.eq", int_hexagon_A4_rcmpeq>; +def Hexagon_A4_rcmpeqi : si_ALU32_sis8 <"cmp.eq", int_hexagon_A4_rcmpeqi>; + + +/******************************************************************** +* CR * +*********************************************************************/ + +// CR / Corner Detection Acceleration. +def Hexagon_C4_fastcorner9: + qi_SInst_qiqi<"fastcorner9", int_hexagon_C4_fastcorner9>; +def Hexagon_C4_fastcorner9_not: + qi_neg_SInst_qiqi<"fastcorner9",int_hexagon_C4_fastcorner9_not>; + +// CR / Logical Operations On Predicates. +def Hexagon_C4_and_andn: + qi_SInst_qi_andqiqi_neg <"and", int_hexagon_C4_and_andn>; +def Hexagon_C4_and_and: + qi_SInst_qi_andqiqi <"and", int_hexagon_C4_and_and>; +def Hexagon_C4_and_orn: + qi_SInst_qi_orqiqi_neg <"and", int_hexagon_C4_and_orn>; +def Hexagon_C4_and_or: + qi_SInst_qi_orqiqi <"and", int_hexagon_C4_and_or>; +def Hexagon_C4_or_andn: + qi_SInst_qi_andqiqi_neg <"or", int_hexagon_C4_or_andn>; +def Hexagon_C4_or_and: + qi_SInst_qi_andqiqi <"or", int_hexagon_C4_or_and>; +def Hexagon_C4_or_orn: + qi_SInst_qi_orqiqi_neg <"or", int_hexagon_C4_or_orn>; +def Hexagon_C4_or_or: + qi_SInst_qi_orqiqi <"or", int_hexagon_C4_or_or>; + + +/******************************************************************** +* XTYPE/ALU * +*********************************************************************/ + +// XTYPE / ALU / Add And Accumulate. +def Hexagon_S4_addaddi: + si_SInst_si_addsis6 <"add", int_hexagon_S4_addaddi>; +def Hexagon_S4_subaddi: + si_SInst_si_subs6si <"add", int_hexagon_S4_subaddi>; + +// XTYPE / ALU / Logical Doublewords. +def Hexagon_S4_andnp: + di_ALU64_didi_neg <"and", int_hexagon_A4_andnp>; +def Hexagon_S4_ornp: + di_ALU64_didi_neg <"or", int_hexagon_A4_ornp>; + +// XTYPE / ALU / Logical-logical Doublewords. +def Hexagon_M4_xor_xacc: + di_MInst_dididi_xacc <"xor", int_hexagon_M4_xor_xacc>; + +// XTYPE / ALU / Logical-logical Words. +def HEXAGON_M4_and_and: + si_MInst_sisisi_and <"and", int_hexagon_M4_and_and>; +def HEXAGON_M4_and_or: + si_MInst_sisisi_and <"or", int_hexagon_M4_and_or>; +def HEXAGON_M4_and_xor: + si_MInst_sisisi_and <"xor", int_hexagon_M4_and_xor>; +def HEXAGON_M4_and_andn: + si_MInst_sisisi_andn <"and", int_hexagon_M4_and_andn>; +def HEXAGON_M4_xor_and: + si_MInst_sisisi_xor <"and", int_hexagon_M4_xor_and>; +def HEXAGON_M4_xor_or: + si_MInst_sisisi_xor <"or", int_hexagon_M4_xor_or>; +def HEXAGON_M4_xor_andn: + si_MInst_sisisi_xorn <"and", int_hexagon_M4_xor_andn>; +def HEXAGON_M4_or_and: + si_MInst_sisisi_or <"and", int_hexagon_M4_or_and>; +def HEXAGON_M4_or_or: + si_MInst_sisisi_or <"or", int_hexagon_M4_or_or>; +def HEXAGON_M4_or_xor: + si_MInst_sisisi_or <"xor", int_hexagon_M4_or_xor>; +def HEXAGON_M4_or_andn: + si_MInst_sisisi_orn <"and", int_hexagon_M4_or_andn>; +def HEXAGON_S4_or_andix: + si_SInst_sisis10_andi <"or", int_hexagon_S4_or_andix>; +def HEXAGON_S4_or_andi: + si_SInst_sisis10_or <"and", int_hexagon_S4_or_andi>; +def HEXAGON_S4_or_ori: + si_SInst_sisis10_or <"or", int_hexagon_S4_or_ori>; + +// XTYPE / ALU / Modulo wrap. +def HEXAGON_A4_modwrapu: + si_ALU64_sisi <"modwrap", int_hexagon_A4_modwrapu>; + +// XTYPE / ALU / Round. 
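The round forms that follow discard the low #u5 bits after adding half of the discarded weight; cround uses convergent rounding instead, which this sketch does not attempt to model:

#include <cstdint>

// Sketch of Rd = round(Rs, #u5): round to nearest, then arithmetic shift right.
int32_t round_ri(int32_t rs, unsigned u5) {
  if (u5 == 0) return rs;                        // nothing to round away
  int64_t biased = (int64_t)rs + (1ll << (u5 - 1));
  return (int32_t)(biased >> u5);                // the :sat variant would clamp here
}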
+def HEXAGON_A4_cround_ri: + si_SInst_siu5 <"cround", int_hexagon_A4_cround_ri>; +def HEXAGON_A4_cround_rr: + si_SInst_sisi <"cround", int_hexagon_A4_cround_rr>; +def HEXAGON_A4_round_ri: + si_SInst_siu5 <"round", int_hexagon_A4_round_ri>; +def HEXAGON_A4_round_rr: + si_SInst_sisi <"round", int_hexagon_A4_round_rr>; +def HEXAGON_A4_round_ri_sat: + si_SInst_siu5_sat <"round", int_hexagon_A4_round_ri_sat>; +def HEXAGON_A4_round_rr_sat: + si_SInst_sisi_sat <"round", int_hexagon_A4_round_rr_sat>; + +// XTYPE / ALU / Vector reduce add unsigned halfwords. +// XTYPE / ALU / Vector add bytes. +// XTYPE / ALU / Vector conditional negate. +// XTYPE / ALU / Vector maximum bytes. +// XTYPE / ALU / Vector reduce maximum halfwords. +// XTYPE / ALU / Vector reduce maximum words. +// XTYPE / ALU / Vector minimum bytes. +// XTYPE / ALU / Vector reduce minimum halfwords. +// XTYPE / ALU / Vector reduce minimum words. +// XTYPE / ALU / Vector subtract bytes. + + +/******************************************************************** +* XTYPE/BIT * +*********************************************************************/ + +// XTYPE / BIT / Count leading. +// XTYPE / BIT / Count trailing. +// XTYPE / BIT / Extract bitfield. +// XTYPE / BIT / Masked parity. +// XTYPE / BIT / Bit reverse. +// XTYPE / BIT / Split bitfield. + + +/******************************************************************** +* XTYPE/COMPLEX * +*********************************************************************/ + +// XTYPE / COMPLEX / Complex add/sub halfwords. +// XTYPE / COMPLEX / Complex add/sub words. +// XTYPE / COMPLEX / Complex multiply 32x16. +// XTYPE / COMPLEX / Vector reduce complex rotate. + + +/******************************************************************** +* XTYPE/MPY * +*********************************************************************/ + +// XTYPE / COMPLEX / Complex add/sub halfwords. diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h new file mode 100644 index 0000000..0318c51 --- /dev/null +++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h @@ -0,0 +1,75 @@ +//=- HexagonMachineFuctionInfo.h - Hexagon machine function info --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef HexagonMACHINEFUNCTIONINFO_H +#define HexagonMACHINEFUNCTIONINFO_H + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + + namespace Hexagon { + const unsigned int StartPacket = 0x1; + const unsigned int EndPacket = 0x2; + } + + +/// Hexagon target-specific information for each MachineFunction. +class HexagonMachineFunctionInfo : public MachineFunctionInfo { + // SRetReturnReg - Some subtargets require that sret lowering includes + // returning the value of the returned struct in a register. This field + // holds the virtual register into which the sret argument is passed. 
+ unsigned SRetReturnReg; + std::vector<MachineInstr*> AllocaAdjustInsts; + int VarArgsFrameIndex; + bool HasClobberLR; + + std::map<const MachineInstr*, unsigned> PacketInfo; + + +public: + HexagonMachineFunctionInfo() : SRetReturnReg(0), HasClobberLR(0) {} + + HexagonMachineFunctionInfo(MachineFunction &MF) : SRetReturnReg(0), + HasClobberLR(0) {} + + unsigned getSRetReturnReg() const { return SRetReturnReg; } + void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } + + void addAllocaAdjustInst(MachineInstr* MI) { + AllocaAdjustInsts.push_back(MI); + } + const std::vector<MachineInstr*>& getAllocaAdjustInsts() { + return AllocaAdjustInsts; + } + + void setVarArgsFrameIndex(int v) { VarArgsFrameIndex = v; } + int getVarArgsFrameIndex() { return VarArgsFrameIndex; } + + void setStartPacket(MachineInstr* MI) { + PacketInfo[MI] |= Hexagon::StartPacket; + } + void setEndPacket(MachineInstr* MI) { + PacketInfo[MI] |= Hexagon::EndPacket; + } + bool isStartPacket(const MachineInstr* MI) const { + return (PacketInfo.count(MI) && + (PacketInfo.find(MI)->second & Hexagon::StartPacket)); + } + bool isEndPacket(const MachineInstr* MI) const { + return (PacketInfo.count(MI) && + (PacketInfo.find(MI)->second & Hexagon::EndPacket)); + } + void setHasClobberLR(bool v) { HasClobberLR = v; } + bool hasClobberLR() const { return HasClobberLR; } + +}; +} // End llvm namespace + +#endif diff --git a/lib/Target/Hexagon/HexagonOptimizeSZExtends.cpp b/lib/Target/Hexagon/HexagonOptimizeSZExtends.cpp new file mode 100644 index 0000000..1229aca --- /dev/null +++ b/lib/Target/Hexagon/HexagonOptimizeSZExtends.cpp @@ -0,0 +1,129 @@ +//===-- HexagonOptimizeSZExtends.cpp - Identify and remove sign and -------===// +//===-- zero extends. -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Constants.h" +#include "llvm/PassSupport.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Support/Debug.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include <algorithm> +#include "Hexagon.h" +#include "HexagonTargetMachine.h" + +using namespace llvm; + +namespace { + struct HexagonOptimizeSZExtends : public MachineFunctionPass { + + public: + static char ID; + HexagonOptimizeSZExtends() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { + return "Hexagon remove redundant zero and size extends"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineFunctionAnalysis>(); + AU.addPreserved<MachineFunctionAnalysis>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + private: + }; +} + +char HexagonOptimizeSZExtends::ID = 0; + +// This is a brain dead pass to get rid of redundant sign extends for the +// following case: +// +// Transform the following pattern +// %vreg170<def> = SXTW %vreg166 +// ... 
+// %vreg176<def> = COPY %vreg170:subreg_loreg +// +// Into +// %vreg176<def> = COPY vreg166 + +bool HexagonOptimizeSZExtends::runOnMachineFunction(MachineFunction &MF) { + DenseMap<unsigned, unsigned> SExtMap; + + // Loop over all of the basic blocks + for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); + MBBb != MBBe; ++MBBb) { + MachineBasicBlock* MBB = MBBb; + SExtMap.clear(); + + // Traverse the basic block. + for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); + ++MII) { + MachineInstr *MI = MII; + // Look for sign extends: + // %vreg170<def> = SXTW %vreg166 + if (MI->getOpcode() == Hexagon::SXTW) { + assert (MI->getNumOperands() == 2); + MachineOperand &Dst = MI->getOperand(0); + MachineOperand &Src = MI->getOperand(1); + unsigned DstReg = Dst.getReg(); + unsigned SrcReg = Src.getReg(); + // Just handle virtual registers. + if (TargetRegisterInfo::isVirtualRegister(DstReg) && + TargetRegisterInfo::isVirtualRegister(SrcReg)) { + // Map the following: + // %vreg170<def> = SXTW %vreg166 + // SExtMap[170] = vreg166 + SExtMap[DstReg] = SrcReg; + } + } + // Look for copy: + // %vreg176<def> = COPY %vreg170:subreg_loreg + if (MI->isCopy()) { + assert (MI->getNumOperands() == 2); + MachineOperand &Dst = MI->getOperand(0); + MachineOperand &Src = MI->getOperand(1); + + // Make sure we are copying the lower 32 bits. + if (Src.getSubReg() != Hexagon::subreg_loreg) + continue; + + unsigned DstReg = Dst.getReg(); + unsigned SrcReg = Src.getReg(); + if (TargetRegisterInfo::isVirtualRegister(DstReg) && + TargetRegisterInfo::isVirtualRegister(SrcReg)) { + // Try to find in the map. + if (unsigned SextSrc = SExtMap.lookup(SrcReg)) { + // Change the 1st operand. + MI->RemoveOperand(1); + MI->addOperand(MachineOperand::CreateReg(SextSrc, false)); + } + } + } + } + } + return true; +} + +FunctionPass *llvm::createHexagonOptimizeSZExtends() { + return new HexagonOptimizeSZExtends(); +} diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp new file mode 100644 index 0000000..521e0c1 --- /dev/null +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -0,0 +1,323 @@ +//==- HexagonRegisterInfo.cpp - Hexagon Register Information -----*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Hexagon implementation of the TargetRegisterInfo +// class. 
+// +//===----------------------------------------------------------------------===// + +#include "Hexagon.h" +#include "HexagonRegisterInfo.h" +#include "HexagonSubtarget.h" +#include "HexagonTargetMachine.h" +#include "HexagonMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Type.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include <iostream> + +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Function.h" +using namespace llvm; + + +HexagonRegisterInfo::HexagonRegisterInfo(HexagonSubtarget &st, + const HexagonInstrInfo &tii) + : HexagonGenRegisterInfo(Hexagon::R31), + Subtarget(st), + TII(tii) { +} + +const unsigned* HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction + *MF) + const { + static const unsigned CalleeSavedRegsV2[] = { + Hexagon::R24, Hexagon::R25, Hexagon::R26, Hexagon::R27, 0 + }; + static const unsigned CalleeSavedRegsV3[] = { + Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19, + Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, + Hexagon::R24, Hexagon::R25, Hexagon::R26, Hexagon::R27, 0 + }; + + switch(Subtarget.getHexagonArchVersion()) { + case HexagonSubtarget::V2: + return CalleeSavedRegsV2; + break; + case HexagonSubtarget::V3: + case HexagonSubtarget::V4: + return CalleeSavedRegsV3; + break; + default: + const char *ErrorString = + "Callee saved registers requested for unknown archtecture version"; + llvm_unreachable(ErrorString); + } +} + +BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF) + const { + BitVector Reserved(getNumRegs()); + Reserved.set(HEXAGON_RESERVED_REG_1); + Reserved.set(HEXAGON_RESERVED_REG_2); + Reserved.set(Hexagon::R29); + Reserved.set(Hexagon::R30); + Reserved.set(Hexagon::R31); + Reserved.set(Hexagon::D14); + Reserved.set(Hexagon::D15); + Reserved.set(Hexagon::LC0); + Reserved.set(Hexagon::LC1); + Reserved.set(Hexagon::SA0); + Reserved.set(Hexagon::SA1); + return Reserved; +} + + +const TargetRegisterClass* const* +HexagonRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { + static const TargetRegisterClass * const CalleeSavedRegClassesV2[] = { + &Hexagon::IntRegsRegClass, &Hexagon::IntRegsRegClass, + &Hexagon::IntRegsRegClass, &Hexagon::IntRegsRegClass, + }; + static const TargetRegisterClass * const CalleeSavedRegClassesV3[] = { + &Hexagon::IntRegsRegClass, &Hexagon::IntRegsRegClass, + &Hexagon::IntRegsRegClass, &Hexagon::IntRegsRegClass, + &Hexagon::IntRegsRegClass, &Hexagon::IntRegsRegClass, + &Hexagon::IntRegsRegClass, &Hexagon::IntRegsRegClass, + &Hexagon::IntRegsRegClass, &Hexagon::IntRegsRegClass, + &Hexagon::IntRegsRegClass, &Hexagon::IntRegsRegClass, + }; + + switch(Subtarget.getHexagonArchVersion()) { + case HexagonSubtarget::V2: + return CalleeSavedRegClassesV2; + break; + case HexagonSubtarget::V3: + case HexagonSubtarget::V4: + return CalleeSavedRegClassesV3; + break; + default: + const char *ErrorString = + "Callee saved register classes requested for unknown archtecture version"; + llvm_unreachable(ErrorString); + } +} + +void HexagonRegisterInfo:: 
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + MachineInstr &MI = *I; + + if (MI.getOpcode() == Hexagon::ADJCALLSTACKDOWN) { + // Hexagon_TODO: add code + } else if (MI.getOpcode() == Hexagon::ADJCALLSTACKUP) { + // Hexagon_TODO: add code + } else { + assert(0 && "Cannot handle this call frame pseudo instruction"); + } + MBB.erase(I); +} + +void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS) const { + + // + // Hexagon_TODO: Do we need to enforce this for Hexagon? + assert(SPAdj == 0 && "Unexpected"); + + + unsigned i = 0; + MachineInstr &MI = *II; + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + + int FrameIndex = MI.getOperand(i).getIndex(); + + // Addressable stack objects are accessed using neg. offsets from %fp. + MachineFunction &MF = *MI.getParent()->getParent(); + int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + + unsigned FrameReg = getFrameRegister(MF); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + if (!TFI->hasFP(MF)) { + // We will not reserve space on the stack for the lr and fp registers. + Offset -= 2 * Hexagon_WordSize; + } + + const unsigned FrameSize = MFI.getStackSize(); + + if (!MFI.hasVarSizedObjects() && + TII.isValidOffset(MI.getOpcode(), (FrameSize+Offset)) && + !TII.isSpillPredRegOp(&MI)) { + // Replace frame index with a stack pointer reference. + MI.getOperand(i).ChangeToRegister(getStackRegister(), false, false, true); + MI.getOperand(i+1).ChangeToImmediate(FrameSize+Offset); + } else { + // Replace frame index with a frame pointer reference. + if (!TII.isValidOffset(MI.getOpcode(), Offset)) { + + // If the offset overflows, then correct it. + // + // For loads, we do not need a reserved register + // r0 = memw(r30 + #10000) to: + // + // r0 = add(r30, #10000) + // r0 = memw(r0) + if ( (MI.getOpcode() == Hexagon::LDriw) || + (MI.getOpcode() == Hexagon::LDrid) || + (MI.getOpcode() == Hexagon::LDrih) || + (MI.getOpcode() == Hexagon::LDriuh) || + (MI.getOpcode() == Hexagon::LDrib) || + (MI.getOpcode() == Hexagon::LDriub) ) { + unsigned dstReg = (MI.getOpcode() == Hexagon::LDrid) ? + *getSubRegisters(MI.getOperand(0).getReg()) : + MI.getOperand(0).getReg(); + + // Check if offset can fit in addi. + if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) { + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(Hexagon::CONST32_Int_Real), dstReg).addImm(Offset); + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(Hexagon::ADD_rr), + dstReg).addReg(FrameReg).addReg(dstReg); + } else { + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(Hexagon::ADD_ri), + dstReg).addReg(FrameReg).addImm(Offset); + } + + MI.getOperand(i).ChangeToRegister(dstReg, false, false, true); + MI.getOperand(i+1).ChangeToImmediate(0); + } else if ((MI.getOpcode() == Hexagon::STriw) || + (MI.getOpcode() == Hexagon::STrid) || + (MI.getOpcode() == Hexagon::STrih) || + (MI.getOpcode() == Hexagon::STrib) || + (MI.getOpcode() == Hexagon::STriwt)) { + // For stores, we need a reserved register. Change + // memw(r30 + #10000) = r0 to: + // + // rs = add(r30, #10000); + // memw(rs) = r0 + unsigned resReg = HEXAGON_RESERVED_REG_1; + + // Check if offset can fit in addi. 
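// Illustrative sketch of the two sequences the store branch below produces,
// assuming FrameReg is r30 and the reserved register HEXAGON_RESERVED_REG_1
// is r10; the mnemonics are spelled informally:
//
//   Offset fits ADD_ri:           r10 = add(r30, #Offset)
//                                 memw(r10 + #0) = r0
//
//   Offset does not fit ADD_ri:   r10 = ##Offset          (CONST32_Int_Real)
//                                 r10 = add(r30, r10)     (ADD_rr)
//                                 memw(r10 + #0) = r0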
+ if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) { + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(Hexagon::CONST32_Int_Real), resReg).addImm(Offset); + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(Hexagon::ADD_rr), + resReg).addReg(FrameReg).addReg(resReg); + } else { + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(Hexagon::ADD_ri), + resReg).addReg(FrameReg).addImm(Offset); + } + MI.getOperand(i).ChangeToRegister(resReg, false, false, true); + MI.getOperand(i+1).ChangeToImmediate(0); + } else if (TII.isMemOp(&MI)) { + unsigned resReg = HEXAGON_RESERVED_REG_1; + if (!MFI.hasVarSizedObjects() && + TII.isValidOffset(MI.getOpcode(), (FrameSize+Offset))) { + MI.getOperand(i).ChangeToRegister(getStackRegister(), false, false, + true); + MI.getOperand(i+1).ChangeToImmediate(FrameSize+Offset); + } else if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) { + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(Hexagon::CONST32_Int_Real), resReg).addImm(Offset); + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(Hexagon::ADD_rr), + resReg).addReg(FrameReg).addReg(resReg); + MI.getOperand(i).ChangeToRegister(resReg, false, false, true); + MI.getOperand(i+1).ChangeToImmediate(0); + } else { + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(Hexagon::ADD_ri), + resReg).addReg(FrameReg).addImm(Offset); + MI.getOperand(i).ChangeToRegister(resReg, false, false, true); + MI.getOperand(i+1).ChangeToImmediate(0); + } + } else { + unsigned dstReg = MI.getOperand(0).getReg(); + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(Hexagon::CONST32_Int_Real), dstReg).addImm(Offset); + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(Hexagon::ADD_rr), + dstReg).addReg(FrameReg).addReg(dstReg); + // Can we delete MI??? r2 = add (r2, #0). + MI.getOperand(i).ChangeToRegister(dstReg, false, false, true); + MI.getOperand(i+1).ChangeToImmediate(0); + } + } else { + // If the offset is small enough to fit in the immediate field, directly + // encode it. + MI.getOperand(i).ChangeToRegister(FrameReg, false); + MI.getOperand(i+1).ChangeToImmediate(Offset); + } + } + +} + +unsigned HexagonRegisterInfo::getRARegister() const { + return Hexagon::R31; +} + +unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction + &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + if (TFI->hasFP(MF)) { + return Hexagon::R30; + } + + return Hexagon::R29; +} + +unsigned HexagonRegisterInfo::getFrameRegister() const { + return Hexagon::R30; +} + +unsigned HexagonRegisterInfo::getStackRegister() const { + return Hexagon::R29; +} + +void HexagonRegisterInfo::getInitialFrameState(std::vector<MachineMove> + &Moves) const +{ + // VirtualFP = (R30 + #0). 
+ unsigned FPReg = getFrameRegister(); + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(FPReg, 0); + Moves.push_back(MachineMove(0, Dst, Src)); +} + +unsigned HexagonRegisterInfo::getEHExceptionRegister() const { + assert(0 && "What is the exception register"); + return 0; +} + +unsigned HexagonRegisterInfo::getEHHandlerRegister() const { + assert(0 && "What is the exception handler register"); + return 0; +} + +#define GET_REGINFO_TARGET_DESC +#include "HexagonGenRegisterInfo.inc" diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h new file mode 100644 index 0000000..33b0c14 --- /dev/null +++ b/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -0,0 +1,89 @@ +//==- HexagonRegisterInfo.h - Hexagon Register Information Impl --*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Hexagon implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#ifndef HexagonREGISTERINFO_H +#define HexagonREGISTERINFO_H + +#include "llvm/Target/TargetRegisterInfo.h" +#define GET_REGINFO_HEADER +#include "HexagonGenRegisterInfo.inc" +#include "llvm/MC/MachineLocation.h" + +// +// We try not to hard code the reserved registers in our code, +// so the following two macros were defined. However, there +// are still a few places that R11 and R10 are hard wired. +// See below. If, in the future, we decided to change the reserved +// register. Don't forget changing the following places. +// +// 1. the "Defs" set of STriw_pred in HexagonInstrInfo.td +// 2. the "Defs" set of LDri_pred in HexagonInstrInfo.td +// 3. the definition of "IntRegs" in HexagonRegisterInfo.td +// 4. the definition of "DoubleRegs" in HexagonRegisterInfo.td +// +#define HEXAGON_RESERVED_REG_1 Hexagon::R10 +#define HEXAGON_RESERVED_REG_2 Hexagon::R11 + +namespace llvm { + +class HexagonSubtarget; +class HexagonInstrInfo; +class Type; + +struct HexagonRegisterInfo : public HexagonGenRegisterInfo { + HexagonSubtarget &Subtarget; + const HexagonInstrInfo &TII; + + HexagonRegisterInfo(HexagonSubtarget &st, const HexagonInstrInfo &tii); + + /// Code Generation virtual methods... + const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; + + const TargetRegisterClass* const* getCalleeSavedRegClasses( + const MachineFunction *MF = 0) const; + + BitVector getReservedRegs(const MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; + + /// determineFrameLayout - Determine the size of the frame and maximum call + /// frame size. + void determineFrameLayout(MachineFunction &MF) const; + + /// requiresRegisterScavenging - returns true since we may need scavenging for + /// a temporary register when generating hardware loop instructions. + bool requiresRegisterScavenging(const MachineFunction &MF) const { + return true; + } + + // Debug information queries. 
+ unsigned getRARegister() const; + unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getFrameRegister() const; + void getInitialFrameState(std::vector<MachineMove> &Moves) const; + unsigned getStackRegister() const; + + // Exception handling queries. + unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td new file mode 100644 index 0000000..c05f844 --- /dev/null +++ b/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -0,0 +1,169 @@ +//===- HexagonRegisterInfo.td - Hexagon Register defs ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the Hexagon register file. +//===----------------------------------------------------------------------===// + +class HexagonReg<string n> : Register<n> { + field bits<5> Num; + let Namespace = "Hexagon"; +} + +class HexagonDoubleReg<string n, list<Register> subregs> : + RegisterWithSubRegs<n, subregs> { + field bits<5> Num; + let Namespace = "Hexagon"; +} + +// Registers are identified with 5-bit ID numbers. +// Ri - 32-bit integer registers. +class Ri<bits<5> num, string n> : HexagonReg<n> { + let Num = num; +} + +// Rf - 32-bit floating-point registers. +class Rf<bits<5> num, string n> : HexagonReg<n> { + let Num = num; +} + + +// Rd - 64 bit registers. +class Rd<bits<5> num, string n, list<Register> subregs> : +HexagonDoubleReg<n, subregs> { + let Num = num; + let SubRegs = subregs; +} + + +class Rp<bits<5> num, string n> : HexagonReg<n> { + let Num = num; +} + +class Rc<bits<5> num, string n> : HexagonReg<n> { + let Num = num; +} + +let Namespace = "Hexagon" in { + + def subreg_loreg : SubRegIndex; + def subreg_hireg : SubRegIndex; + + // Integer registers. 
+ def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>; + def R1 : Ri< 1, "r1">, DwarfRegNum<[1]>; + def R2 : Ri< 2, "r2">, DwarfRegNum<[2]>; + def R3 : Ri< 3, "r3">, DwarfRegNum<[3]>; + def R4 : Ri< 4, "r4">, DwarfRegNum<[4]>; + def R5 : Ri< 5, "r5">, DwarfRegNum<[5]>; + def R6 : Ri< 6, "r6">, DwarfRegNum<[6]>; + def R7 : Ri< 7, "r7">, DwarfRegNum<[7]>; + def R8 : Ri< 8, "r8">, DwarfRegNum<[8]>; + def R9 : Ri< 9, "r9">, DwarfRegNum<[9]>; + def R10 : Ri<10, "r10">, DwarfRegNum<[10]>; + def R11 : Ri<11, "r11">, DwarfRegNum<[11]>; + def R12 : Ri<12, "r12">, DwarfRegNum<[12]>; + def R13 : Ri<13, "r13">, DwarfRegNum<[13]>; + def R14 : Ri<14, "r14">, DwarfRegNum<[14]>; + def R15 : Ri<15, "r15">, DwarfRegNum<[15]>; + def R16 : Ri<16, "r16">, DwarfRegNum<[16]>; + def R17 : Ri<17, "r17">, DwarfRegNum<[17]>; + def R18 : Ri<18, "r18">, DwarfRegNum<[18]>; + def R19 : Ri<19, "r19">, DwarfRegNum<[19]>; + def R20 : Ri<20, "r20">, DwarfRegNum<[20]>; + def R21 : Ri<21, "r21">, DwarfRegNum<[21]>; + def R22 : Ri<22, "r22">, DwarfRegNum<[22]>; + def R23 : Ri<23, "r23">, DwarfRegNum<[23]>; + def R24 : Ri<24, "r24">, DwarfRegNum<[24]>; + def R25 : Ri<25, "r25">, DwarfRegNum<[25]>; + def R26 : Ri<26, "r26">, DwarfRegNum<[26]>; + def R27 : Ri<27, "r27">, DwarfRegNum<[27]>; + def R28 : Ri<28, "r28">, DwarfRegNum<[28]>; + def R29 : Ri<29, "r29">, DwarfRegNum<[29]>; + def R30 : Ri<30, "r30">, DwarfRegNum<[30]>; + def R31 : Ri<31, "r31">, DwarfRegNum<[31]>; + + + def PC : Ri<31, "r31">, DwarfRegNum<[32]>; + def GP : Ri<31, "r31">, DwarfRegNum<[33]>; + + // Aliases of the R* registers used to hold 64-bit int values (doubles). + let SubRegIndices = [subreg_loreg, subreg_hireg] in { + def D0 : Rd< 0, "r1:0", [R0, R1]>, DwarfRegNum<[32]>; + def D1 : Rd< 2, "r3:2", [R2, R3]>, DwarfRegNum<[34]>; + def D2 : Rd< 4, "r5:4", [R4, R5]>, DwarfRegNum<[36]>; + def D3 : Rd< 6, "r7:6", [R6, R7]>, DwarfRegNum<[38]>; + def D4 : Rd< 8, "r9:8", [R8, R9]>, DwarfRegNum<[40]>; + def D5 : Rd<10, "r11:10", [R10, R11]>, DwarfRegNum<[42]>; + def D6 : Rd<12, "r13:12", [R12, R13]>, DwarfRegNum<[44]>; + def D7 : Rd<14, "r15:14", [R14, R15]>, DwarfRegNum<[46]>; + def D8 : Rd<16, "r17:16", [R16, R17]>, DwarfRegNum<[48]>; + def D9 : Rd<18, "r19:18", [R18, R19]>, DwarfRegNum<[50]>; + def D10 : Rd<20, "r21:20", [R20, R21]>, DwarfRegNum<[52]>; + def D11 : Rd<22, "r23:22", [R22, R23]>, DwarfRegNum<[54]>; + def D12 : Rd<24, "r25:24", [R24, R25]>, DwarfRegNum<[56]>; + def D13 : Rd<26, "r27:26", [R26, R27]>, DwarfRegNum<[58]>; + def D14 : Rd<28, "r29:28", [R28, R29]>, DwarfRegNum<[60]>; + def D15 : Rd<30, "r31:30", [R30, R31]>, DwarfRegNum<[62]>; + } + + // Predicate registers. + def P0 : Rp< 0, "p0">, DwarfRegNum<[63]>; + def P1 : Rp< 0, "p1">, DwarfRegNum<[64]>; + def P2 : Rp< 0, "p2">, DwarfRegNum<[65]>; + def P3 : Rp< 0, "p3">, DwarfRegNum<[66]>; + + // Control registers. + def SA0 : Rc<0, "sa0">, DwarfRegNum<[67]>; + def LC0 : Rc<0, "lc0">, DwarfRegNum<[68]>; + + def SA1 : Rc<0, "sa1">, DwarfRegNum<[69]>; + def LC1 : Rc<0, "lc1">, DwarfRegNum<[70]>; +} + + + + + + + + + + +// Register classes. +// +// FIXME: the register order should be defined in terms of the preferred +// allocation order... 
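The Rd definitions above pair consecutive 32-bit registers into one 64-bit register, e.g. D0 is "r1:0" built from R0 and R1, with subreg_loreg and subreg_hireg naming the two halves. A small C++ sketch of that lo/hi split, purely for illustration:

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t d0 = 0x0000000100000002ULL;  // plays the role of the D0 pair "r1:0"
    uint32_t r0 = (uint32_t)d0;           // subreg_loreg -> R0
    uint32_t r1 = (uint32_t)(d0 >> 32);   // subreg_hireg -> R1
    printf("r1:0 = %u:%u\n", r1, r0);     // prints "r1:0 = 1:2"
    return 0;
  }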
+// +def IntRegs : RegisterClass<"Hexagon", [i32], 32, (add (sequence "R%u", 0, 9), + (sequence "R%u", 12, 28), + R10, R11, R29, R30, + R31)> { +} + + + +def DoubleRegs : RegisterClass<"Hexagon", [i64], 64, (add (sequence "D%u", 0, + 4), + (sequence "D%u", 6, 13), + D5, D14, D15)> { + let SubRegClasses = [(IntRegs subreg_loreg, subreg_hireg)]; +} + + +def PredRegs : RegisterClass<"Hexagon", [i1], 32, (add (sequence "P%u", 0, 3))> +{ + let Size = 32; +} + +def CRRegs : RegisterClass<"Hexagon", [i32], 32, (add (sequence "LC%u", 0, 1), + (sequence "SA%u", 0, 1), + PC)> { + let Size = 32; +} diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp new file mode 100644 index 0000000..3ca257f --- /dev/null +++ b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp @@ -0,0 +1,85 @@ +//=- HexagonRemoveExtendArgs.cpp - Remove unecessary argument sign extends --=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Pass that removes sign extends for function parameters. These parameters +// are already sign extended by the caller per Hexagon's ABI +// +//===----------------------------------------------------------------------===// + + + +#include "llvm/Pass.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "HexagonTargetMachine.h" +#include <iostream> + +using namespace llvm; +namespace { + struct HexagonRemoveExtendArgs : public FunctionPass { + public: + static char ID; + HexagonRemoveExtendArgs() : FunctionPass(ID) {} + virtual bool runOnFunction(Function &F); + + const char *getPassName() const { + return "Remove sign extends"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineFunctionAnalysis>(); + AU.addPreserved<MachineFunctionAnalysis>(); + FunctionPass::getAnalysisUsage(AU); + } + }; +} + +char HexagonRemoveExtendArgs::ID = 0; +RegisterPass<HexagonRemoveExtendArgs> X("reargs", + "Remove Sign and Zero Extends for Args" + ); + + + +bool HexagonRemoveExtendArgs::runOnFunction(Function &F) { + unsigned Idx = 1; + for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE; + ++AI, ++Idx) { + if (F.paramHasAttr(Idx, Attribute::SExt)) { + Argument* Arg = AI; + if (!isa<PointerType>(Arg->getType())) { + for (Instruction::use_iterator UI = Arg->use_begin(); + UI != Arg->use_end();) { + if (isa<SExtInst>(*UI)) { + Instruction* Use = cast<Instruction>(*UI); + SExtInst* SI = new SExtInst(Arg, Use->getType()); + assert (EVT::getEVT(SI->getType()) == + (EVT::getEVT(Use->getType()))); + ++UI; + Use->replaceAllUsesWith(SI); + Instruction* First = F.getEntryBlock().begin(); + SI->insertBefore(First); + Use->eraseFromParent(); + } else { + ++UI; + } + } + } + } + } + return true; +} + + + +FunctionPass *llvm::createHexagonRemoveExtendOps(HexagonTargetMachine &TM) { + return new HexagonRemoveExtendArgs(); +} diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td new file mode 100644 index 0000000..427d1cb --- /dev/null +++ b/lib/Target/Hexagon/HexagonSchedule.td @@ -0,0 +1,53 @@ +//===-HexagonSchedule.td - Hexagon Scheduling Definitions -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois 
Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// Functional Units +def LUNIT : FuncUnit; +def LSUNIT : FuncUnit; +def MUNIT : FuncUnit; +def SUNIT : FuncUnit; + + +// Itinerary classes +def ALU32 : InstrItinClass; +def ALU64 : InstrItinClass; +def CR : InstrItinClass; +def J : InstrItinClass; +def JR : InstrItinClass; +def LD : InstrItinClass; +def M : InstrItinClass; +def ST : InstrItinClass; +def S : InstrItinClass; +def PSEUDO : InstrItinClass; + + +def HexagonItineraries : + ProcessorItineraries<[LUNIT, LSUNIT, MUNIT, SUNIT], [], [ + InstrItinData<ALU32 , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>, + InstrItinData<ALU64 , [InstrStage<1, [MUNIT, SUNIT]>]>, + InstrItinData<CR , [InstrStage<1, [SUNIT]>]>, + InstrItinData<J , [InstrStage<1, [SUNIT, MUNIT]>]>, + InstrItinData<JR , [InstrStage<1, [MUNIT]>]>, + InstrItinData<LD , [InstrStage<1, [LUNIT, LSUNIT]>]>, + InstrItinData<M , [InstrStage<1, [MUNIT, SUNIT]>]>, + InstrItinData<ST , [InstrStage<1, [LSUNIT]>]>, + InstrItinData<S , [InstrStage<1, [SUNIT, MUNIT]>]>, + InstrItinData<PSEUDO , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]> +]>; + + +//===----------------------------------------------------------------------===// +// V4 Machine Info + +//===----------------------------------------------------------------------===// + +include "HexagonScheduleV4.td" + +//===----------------------------------------------------------------------===// +// V4 Machine Info - +//===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td new file mode 100644 index 0000000..4cf66fe --- /dev/null +++ b/lib/Target/Hexagon/HexagonScheduleV4.td @@ -0,0 +1,56 @@ +//=-HexagonScheduleV4.td - HexagonV4 Scheduling Definitions --*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// There are four SLOTS (four parallel pipelines) in Hexagon V4 machine. +// This file describes that machine information. + +// +// |===========|==================================================| +// | PIPELINE | Instruction Classes | +// |===========|==================================================| +// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM | +// |-----------|--------------------------------------------------| +// | SLOT1 | LD ST ALU32 | +// |-----------|--------------------------------------------------| +// | SLOT2 | XTYPE ALU32 J JR | +// |-----------|--------------------------------------------------| +// | SLOT3 | XTYPE ALU32 J CR | +// |===========|==================================================| + + +// Functional Units. +def SLOT0 : FuncUnit; +def SLOT1 : FuncUnit; +def SLOT2 : FuncUnit; +def SLOT3 : FuncUnit; + +// Itinerary classes. +def NV_V4 : InstrItinClass; +def MEM_V4 : InstrItinClass; +// ALU64/M/S Instruction classes of V2 are collectively knownn as XTYPE in V4. 
+ +def HexagonItinerariesV4 : ProcessorItineraries< + [SLOT0, SLOT1, SLOT2, SLOT3], [], [ + InstrItinData<LD , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<ST , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<ALU32 , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<NV_V4 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<MEM_V4 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<J , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<JR , [InstrStage<1, [SLOT2]>]>, + InstrItinData<CR , [InstrStage<1, [SLOT3]>]>, + InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU64 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<M , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S , [InstrStage<1, [SLOT2, SLOT3]>]> +]>; + +//===----------------------------------------------------------------------===// +// Hexagon V4 Resource Definitions - +//===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonSelectCCInfo.td b/lib/Target/Hexagon/HexagonSelectCCInfo.td new file mode 100644 index 0000000..f21d928 --- /dev/null +++ b/lib/Target/Hexagon/HexagonSelectCCInfo.td @@ -0,0 +1,121 @@ +//=-HexagoSelectCCInfo.td - Selectcc mappings ----------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + +// +// selectcc mappings. +// +def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETEQ)), + (i32 (MUX_rr (i1 (CMPEQrr IntRegs:$lhs, IntRegs:$rhs)), + IntRegs:$tval, IntRegs:$fval))>; + +def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETNE)), + (i32 (MUX_rr (i1 (NOT_Ps (CMPEQrr IntRegs:$lhs, IntRegs:$rhs))), + IntRegs:$tval, IntRegs:$fval))>; + +def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETGT)), + (i32 (MUX_rr (i1 (CMPGTrr IntRegs:$lhs, IntRegs:$rhs)), + IntRegs:$tval, IntRegs:$fval))>; + +def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETUGT)), + (i32 (MUX_rr (i1 (CMPGTUrr IntRegs:$lhs, IntRegs:$rhs)), + IntRegs:$tval, IntRegs:$fval))>; + + + +def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETULT)), + (i32 (MUX_rr (i1 (NOT_Ps (CMPGTUrr IntRegs:$lhs, + (ADD_ri IntRegs:$rhs, -1)))), + IntRegs:$tval, IntRegs:$fval))>; + +def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETLT)), + (i32 (MUX_rr (i1 (NOT_Ps (CMPGTrr IntRegs:$lhs, + (ADD_ri IntRegs:$rhs, -1)))), + IntRegs:$tval, IntRegs:$fval))>; + +def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETLE)), + (i32 (MUX_rr (i1 (NOT_Ps (CMPGTrr IntRegs:$lhs, IntRegs:$rhs))), + IntRegs:$tval, IntRegs:$fval))>; + +def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETULE)), + (i32 (MUX_rr (i1 (NOT_Ps (CMPGTUrr IntRegs:$lhs, IntRegs:$rhs))), + IntRegs:$tval, IntRegs:$fval))>; + + +// +// selectcc mappings for greater-equal-to Rs => greater-than Rs-1. 
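The SETGE and SETUGE patterns that follow rely on the identity x >= y <=> x > y - 1. A quick signed spot check in C++ (the y == INT_MIN wrap-around case is deliberately outside this sketch):

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int32_t x = -4; x <= 4; ++x)
      for (int32_t y = -4; y <= 4; ++y)
        assert((x >= y) == (x > y - 1));  // the rewrite used for SETGE below
    return 0;
  }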
+// +def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETGE)), + (i32 (MUX_rr (i1 (CMPGTrr IntRegs:$lhs, (ADD_ri IntRegs:$rhs, -1))), + IntRegs:$tval, IntRegs:$fval))>; + +def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETUGE)), + (i32 (MUX_rr (i1 (CMPGTUrr IntRegs:$lhs, (ADD_ri IntRegs:$rhs, -1))), + IntRegs:$tval, IntRegs:$fval))>; + + + +// +// selectcc mappings for predicate comparisons. +// +// Convert Rd = selectcc(p0, p1, true_val, false_val, SETEQ) into: +// pt = not(p1 xor p2) +// Rd = mux(pt, true_val, false_val) +// and similarly for SETNE +// +def : Pat <(i32 (selectcc PredRegs:$lhs, PredRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETNE)), + (i32 (MUX_rr (i1 (XOR_pp PredRegs:$lhs, PredRegs:$rhs)), IntRegs:$tval, + IntRegs:$fval))>; + +def : Pat <(i32 (selectcc PredRegs:$lhs, PredRegs:$rhs, IntRegs:$tval, + IntRegs:$fval, SETEQ)), + (i32 (MUX_rr (i1 (NOT_pp (XOR_pp PredRegs:$lhs, PredRegs:$rhs))), + IntRegs:$tval, IntRegs:$fval))>; + + +// +// selectcc mappings for 64-bit operands are messy. Hexagon does not have a +// MUX64 o, use this: +// selectcc(Rss, Rdd, tval, fval, cond) -> +// combine(mux(cmp_cond(Rss, Rdd), tval.hi, fval.hi), +// mux(cmp_cond(Rss, Rdd), tval.lo, fval.lo)) + +// setgt-64. +def : Pat<(i64 (selectcc DoubleRegs:$lhs, DoubleRegs:$rhs, DoubleRegs:$tval, + DoubleRegs:$fval, SETGT)), + (COMBINE_rr (MUX_rr (CMPGT64rr DoubleRegs:$lhs, DoubleRegs:$rhs), + (EXTRACT_SUBREG DoubleRegs:$tval, subreg_hireg), + (EXTRACT_SUBREG DoubleRegs:$fval, subreg_hireg)), + (MUX_rr (CMPGT64rr DoubleRegs:$lhs, DoubleRegs:$rhs), + (EXTRACT_SUBREG DoubleRegs:$tval, subreg_loreg), + (EXTRACT_SUBREG DoubleRegs:$fval, subreg_loreg)))>; + + +// setlt-64 -> setgt-64. +def : Pat<(i64 (selectcc DoubleRegs:$lhs, DoubleRegs:$rhs, DoubleRegs:$tval, + DoubleRegs:$fval, SETLT)), + (COMBINE_rr (MUX_rr (CMPGT64rr DoubleRegs:$lhs, + (ADD64_rr DoubleRegs:$rhs, (TFRI64 -1))), + (EXTRACT_SUBREG DoubleRegs:$tval, subreg_hireg), + (EXTRACT_SUBREG DoubleRegs:$fval, subreg_hireg)), + (MUX_rr (CMPGT64rr DoubleRegs:$lhs, + (ADD64_rr DoubleRegs:$rhs, (TFRI64 -1))), + (EXTRACT_SUBREG DoubleRegs:$tval, subreg_loreg), + (EXTRACT_SUBREG DoubleRegs:$fval, subreg_loreg)))>; diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp new file mode 100644 index 0000000..a52c604 --- /dev/null +++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp @@ -0,0 +1,46 @@ +//===-- HexagonSelectionDAGInfo.cpp - Hexagon SelectionDAG Info -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the HexagonSelectionDAGInfo class. 
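The 64-bit selectcc patterns above build the result from two 32-bit muxes that share one predicate and are then recombined. A C++ mirror of that decomposition (the predicate is hard-coded only so the sketch runs stand-alone):

  #include <cassert>
  #include <cstdint>

  int main() {
    int64_t tval = 0x0102030405060708LL;
    int64_t fval = 0x1112131415161718LL;
    bool p = false;                                    // stands in for cmp.gt(Rss, Rdd)
    uint32_t lo = p ? (uint32_t)tval : (uint32_t)fval;                   // mux on subreg_loreg
    uint32_t hi = p ? (uint32_t)(tval >> 32) : (uint32_t)(fval >> 32);   // mux on subreg_hireg
    int64_t res = (int64_t)(((uint64_t)hi << 32) | lo);                  // COMBINE_rr
    assert(res == (p ? tval : fval));
    return 0;
  }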
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hexagon-selectiondag-info" +#include "HexagonTargetMachine.h" +using namespace llvm; + +bool llvm::flag_aligned_memcpy; + +HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const HexagonTargetMachine + &TM) + : TargetSelectionDAGInfo(TM) { +} + +HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() { +} + +SDValue +HexagonSelectionDAGInfo:: +EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, SDValue Chain, + SDValue Dst, SDValue Src, SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const { + flag_aligned_memcpy = false; + if ((Align & 0x3) == 0) { + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (ConstantSize) { + uint64_t SizeVal = ConstantSize->getZExtValue(); + if ((SizeVal > 32) && ((SizeVal % 8) == 0)) + flag_aligned_memcpy = true; + } + } + + return SDValue(); +} diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h new file mode 100644 index 0000000..86fa026 --- /dev/null +++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h @@ -0,0 +1,40 @@ +//=-- HexagonSelectionDAGInfo.h - Hexagon SelectionDAG Info ------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the Hexagon subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef HexagonSELECTIONDAGINFO_H +#define HexagonSELECTIONDAGINFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +class HexagonTargetMachine; + +class HexagonSelectionDAGInfo : public TargetSelectionDAGInfo { +public: + explicit HexagonSelectionDAGInfo(const HexagonTargetMachine &TM); + ~HexagonSelectionDAGInfo(); + + virtual + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const; +}; + +} + +#endif diff --git a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp new file mode 100644 index 0000000..f4d3647 --- /dev/null +++ b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp @@ -0,0 +1,136 @@ +//===---- HexagonSplitTFRCondSets.cpp - split TFR condsets into xfers -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// +//===----------------------------------------------------------------------===//// +// This pass tries to provide opportunities for better optimization of muxes. +// The default code generated for something like: flag = (a == b) ? 1 : 3; +// would be: +// +// {p0 = cmp.eq(r0,r1)} +// {r3 = mux(p0,#1,#3)} +// +// This requires two packets. If we use .new predicated immediate transfers, +// then we can do this in a single packet, e.g.: +// +// {p0 = cmp.eq(r0,r1) +// if (p0.new) r3 = #1 +// if (!p0.new) r3 = #3} +// +// Note that the conditional assignments are not generated in .new form here. +// We assume opptimisically that they will be formed later. 
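In C terms, the split this pass performs replaces the mux with a pair of complementary predicated assignments. A tiny self-contained illustration of the equivalence (the register names in the comments only echo the example above):

  #include <cassert>

  int main() {
    int a = 2, b = 3, flag;
    bool p0 = (a == b);                   // p0 = cmp.eq(r0, r1)
    if (p0)  flag = 1;                    // if (p0) r3 = #1
    if (!p0) flag = 3;                    // if (!p0) r3 = #3
    assert(flag == ((a == b) ? 1 : 3));   // same result as the original mux
    return 0;
  }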
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "xfer" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/LatencyPriorityQueue.h" +#include "llvm/CodeGen/SchedulerRegistry.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "HexagonTargetMachine.h" +#include "HexagonSubtarget.h" +#include "HexagonMachineFunctionInfo.h" +#include <map> +#include <iostream> + +#include "llvm/Support/CommandLine.h" +#define DEBUG_TYPE "xfer" + + +using namespace llvm; + +namespace { + +class HexagonSplitTFRCondSets : public MachineFunctionPass { + HexagonTargetMachine& QTM; + const HexagonSubtarget &QST; + + public: + static char ID; + HexagonSplitTFRCondSets(HexagonTargetMachine& TM) : + MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {} + + const char *getPassName() const { + return "Hexagon Split TFRCondSets"; + } + bool runOnMachineFunction(MachineFunction &Fn); +}; + + +char HexagonSplitTFRCondSets::ID = 0; + + +bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) { + + const TargetInstrInfo *TII = QTM.getInstrInfo(); + + // Loop over all of the basic blocks. + for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); + MBBb != MBBe; ++MBBb) { + MachineBasicBlock* MBB = MBBb; + // Traverse the basic block. 
+ for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); + ++MII) { + MachineInstr *MI = MII; + int Opc = MI->getOpcode(); + if (Opc == Hexagon::TFR_condset_rr) { + + int DestReg = MI->getOperand(0).getReg(); + int SrcReg1 = MI->getOperand(2).getReg(); + int SrcReg2 = MI->getOperand(3).getReg(); + + // Minor optimization: do not emit the predicated copy if the source and + // the destination is the same register + if (DestReg != SrcReg1) { + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_cPt), + DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg1); + } + if (DestReg != SrcReg2) { + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_cNotPt), + DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg2); + } + MII = MBB->erase(MI); + --MII; + } else if (Opc == Hexagon::TFR_condset_ii) { + int DestReg = MI->getOperand(0).getReg(); + int SrcReg1 = MI->getOperand(1).getReg(); + int Immed1 = MI->getOperand(2).getImm(); + int Immed2 = MI->getOperand(3).getImm(); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFRI_cPt), + DestReg).addReg(SrcReg1).addImm(Immed1); + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFRI_cNotPt), + DestReg).addReg(SrcReg1).addImm(Immed2); + MII = MBB->erase(MI); + --MII; + } + } + } + + return true; +} + +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +FunctionPass *llvm::createHexagonSplitTFRCondSets(HexagonTargetMachine &TM) { + return new HexagonSplitTFRCondSets(TM); +} diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp new file mode 100644 index 0000000..83fb498 --- /dev/null +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -0,0 +1,59 @@ +//===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Hexagon specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "HexagonSubtarget.h" +#include "Hexagon.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +using namespace llvm; + +#define GET_SUBTARGETINFO_CTOR +#define GET_SUBTARGETINFO_TARGET_DESC +#include "HexagonGenSubtargetInfo.inc" + +static cl::opt<bool> +EnableV3("enable-hexagon-v3", cl::Hidden, + cl::desc("Enable Hexagon V3 instructions.")); + +static cl::opt<bool> +EnableMemOps( + "enable-hexagon-memops", + cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, + cl::desc("Generate V4 MEMOP in code generation for Hexagon target")); + +HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS): + HexagonGenSubtargetInfo(TT, CPU, FS), + HexagonArchVersion(V1), + CPUString(CPU.str()) { + ParseSubtargetFeatures(CPU, FS); + + switch(HexagonArchVersion) { + case HexagonSubtarget::V2: + break; + case HexagonSubtarget::V3: + EnableV3 = true; + break; + case HexagonSubtarget::V4: + break; + default: + llvm_unreachable("Unknown Architecture Version."); + } + + // Initialize scheduling itinerary for the specified CPU. 
+ InstrItins = getInstrItineraryForCPU(CPUString); + + if (EnableMemOps) + UseMemOps = true; + else + UseMemOps = false; +} diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h new file mode 100644 index 0000000..6de85df --- /dev/null +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -0,0 +1,74 @@ +//==-- HexagonSubtarget.h - Define Subtarget for the Hexagon ----*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the Hexagon specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef Hexagon_SUBTARGET_H +#define Hexagon_SUBTARGET_H + +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Target/TargetMachine.h" +#include <string> + +#define GET_SUBTARGETINFO_HEADER +#include "HexagonGenSubtargetInfo.inc" + +#define Hexagon_SMALL_DATA_THRESHOLD 8 + +namespace llvm { + +class HexagonSubtarget : public HexagonGenSubtargetInfo { + + bool UseMemOps; + +public: + enum HexagonArchEnum { + V1, V2, V3, V4 + }; + + HexagonArchEnum HexagonArchVersion; + std::string CPUString; + InstrItineraryData InstrItins; + +public: + HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS); + + /// getInstrItins - Return the instruction itineraies based on subtarget + /// selection. + const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + bool hasV2TOps () const { return HexagonArchVersion >= V2; } + bool hasV2TOpsOnly () const { return HexagonArchVersion == V2; } + bool hasV3TOps () const { return HexagonArchVersion >= V3; } + bool hasV3TOpsOnly () const { return HexagonArchVersion == V3; } + bool hasV4TOps () const { return HexagonArchVersion >= V4; } + bool useMemOps () const { return HexagonArchVersion >= V4 && UseMemOps; } + + bool isSubtargetV2() const { return HexagonArchVersion == V2;} + const std::string &getCPUString () const { return CPUString; } + + // Threshold for small data section + unsigned getSmallDataThreshold() const { + return Hexagon_SMALL_DATA_THRESHOLD; + } + const HexagonArchEnum &getHexagonArchVersion() const { + return HexagonArchVersion; + } +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp new file mode 100644 index 0000000..b29e92c --- /dev/null +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -0,0 +1,118 @@ +//===-- HexagonTargetMachine.cpp - Define TargetMachine for Hexagon -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "HexagonTargetMachine.h" +#include "Hexagon.h" +#include "HexagonISelLowering.h" +#include "llvm/Module.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/PassManager.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Support/TargetRegistry.h" +#include <iostream> + +using namespace llvm; + +static cl:: +opt<bool> DisableHardwareLoops( + "disable-hexagon-hwloops", cl::Hidden, + cl::desc("Disable Hardware Loops for Hexagon target")); + +/// HexagonTargetMachineModule - Note that this is used on hosts that +/// cannot link in a library unless there are references into the +/// library. In particular, it seems that it is not possible to get +/// things to work on Win32 without this. Though it is unused, do not +/// remove it. +extern "C" int HexagonTargetMachineModule; +int HexagonTargetMachineModule = 0; + +extern "C" void LLVMInitializeHexagonTarget() { + // Register the target. + RegisterTargetMachine<HexagonTargetMachine> X(TheHexagonTarget); +} + + +/// HexagonTargetMachine ctor - Create an ILP32 architecture model. +/// + +/// Hexagon_TODO: Do I need an aggregate alignment? +/// +HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + TargetOptions Options, + Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + DataLayout("e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-a0:0") , + Subtarget(TT, CPU, FS), TLInfo(*this), InstrInfo(Subtarget), + TSInfo(*this), + FrameLowering(Subtarget), + InstrItins(&Subtarget.getInstrItineraryData()) { + setMCUseCFI(false); +} + +// addPassesForOptimizations - Allow the backend (target) to add Target +// Independent Optimization passes to the Pass Manager. +bool HexagonTargetMachine::addPassesForOptimizations(PassManagerBase &PM) { + + PM.add(createConstantPropagationPass()); + PM.add(createLoopSimplifyPass()); + PM.add(createDeadCodeEliminationPass()); + PM.add(createConstantPropagationPass()); + PM.add(createLoopUnrollPass()); + PM.add(createLoopStrengthReducePass(getTargetLowering())); + return true; +} + +bool HexagonTargetMachine::addInstSelector(PassManagerBase &PM) { + PM.add(createHexagonRemoveExtendOps(*this)); + PM.add(createHexagonISelDag(*this)); + return false; +} + + +bool HexagonTargetMachine::addPreRegAlloc(PassManagerBase &PM) { + if (!DisableHardwareLoops) { + PM.add(createHexagonHardwareLoops()); + } + + return false; +} + +bool HexagonTargetMachine::addPostRegAlloc(PassManagerBase &PM) { + PM.add(createHexagonCFGOptimizer(*this)); + return true; +} + + +bool HexagonTargetMachine::addPreSched2(PassManagerBase &PM) { + PM.add(createIfConverterPass()); + return true; +} + +bool HexagonTargetMachine::addPreEmitPass(PassManagerBase &PM) { + + if (!DisableHardwareLoops) { + PM.add(createHexagonFixupHwLoops()); + } + + // Expand Spill code for predicate registers. + PM.add(createHexagonExpandPredSpillCode(*this)); + + // Split up TFRcondsets into conditional transfers. 
+ PM.add(createHexagonSplitTFRCondSets(*this)); + + return false; +} diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h new file mode 100644 index 0000000..e27d3ae --- /dev/null +++ b/lib/Target/Hexagon/HexagonTargetMachine.h @@ -0,0 +1,86 @@ +//=-- HexagonTargetMachine.h - Define TargetMachine for Hexagon ---*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the Hexagon specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef HexagonTARGETMACHINE_H +#define HexagonTARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "HexagonInstrInfo.h" +#include "HexagonSubtarget.h" +#include "HexagonISelLowering.h" +#include "HexagonSelectionDAGInfo.h" +#include "HexagonFrameLowering.h" + +namespace llvm { + +class Module; + +class HexagonTargetMachine : public LLVMTargetMachine { + const TargetData DataLayout; // Calculates type size & alignment. + HexagonSubtarget Subtarget; + HexagonTargetLowering TLInfo; + HexagonInstrInfo InstrInfo; + HexagonSelectionDAGInfo TSInfo; + HexagonFrameLowering FrameLowering; + const InstrItineraryData* InstrItins; + +public: + HexagonTargetMachine(const Target &T, StringRef TT,StringRef CPU, + StringRef FS, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + virtual const HexagonInstrInfo *getInstrInfo() const { + return &InstrInfo; + } + virtual const HexagonSubtarget *getSubtargetImpl() const { + return &Subtarget; + } + virtual const HexagonRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + + virtual const InstrItineraryData* getInstrItineraryData() const { + return InstrItins; + } + + + virtual const HexagonTargetLowering* getTargetLowering() const { + return &TLInfo; + } + + virtual const HexagonFrameLowering* getFrameLowering() const { + return &FrameLowering; + } + + virtual const HexagonSelectionDAGInfo* getSelectionDAGInfo() const { + return &TSInfo; + } + + virtual const TargetData *getTargetData() const { return &DataLayout; } + static unsigned getModuleMatchQuality(const Module &M); + + // Pass Pipeline Configuration. + virtual bool addPassesForOptimizations(PassManagerBase &PM); + virtual bool addInstSelector(PassManagerBase &PM); + virtual bool addPreEmitPass(PassManagerBase &PM); + virtual bool addPreRegAlloc(llvm::PassManagerBase &PM); + virtual bool addPostRegAlloc(PassManagerBase &PM); + virtual bool addPreSched2(PassManagerBase &PM); +}; + +extern bool flag_aligned_memcpy; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp new file mode 100644 index 0000000..188337d --- /dev/null +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -0,0 +1,94 @@ +//===-- HexagonTargetObjectFile.cpp - Hexagon asm properties ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the HexagonTargetAsmInfo properties. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Target/TargetData.h" +#include "llvm/DerivedTypes.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/CommandLine.h" +#include "HexagonSubtarget.h" +#include "HexagonTargetObjectFile.h" +#include "HexagonTargetMachine.h" + +using namespace llvm; + +static cl::opt<int> SmallDataThreshold("hexagon-small-data-threshold", + cl::init(8), cl::Hidden); + +void HexagonTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + + + SmallDataSection = + getContext().getELFSection(".sdata", ELF::SHT_PROGBITS, + ELF::SHF_WRITE | ELF::SHF_ALLOC, + SectionKind::getDataRel()); + SmallBSSSection = + getContext().getELFSection(".sbss", ELF::SHT_NOBITS, + ELF::SHF_WRITE | ELF::SHF_ALLOC, + SectionKind::getBSS()); +} + +// sdata/sbss support taken largely from the MIPS Backend. +static bool IsInSmallSection(uint64_t Size) { + return Size > 0 && Size <= (uint64_t)SmallDataThreshold; +} +/// IsGlobalInSmallSection - Return true if this global value should be +/// placed into small data/bss section. +bool HexagonTargetObjectFile::IsGlobalInSmallSection(const GlobalValue *GV, + const TargetMachine &TM) const { + // If the primary definition of this global value is outside the current + // translation unit or the global value is available for inspection but not + // emission, then do nothing. + if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage()) + return false; + + // Otherwise, Check if GV should be in sdata/sbss, when normally it would end + // up in getKindForGlobal(GV, TM). + return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM)); +} + +/// IsGlobalInSmallSection - Return true if this global value should be +/// placed into small data/bss section. +bool HexagonTargetObjectFile:: +IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, + SectionKind Kind) const { + // Only global variables, not functions. + const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV); + if (!GVA) + return false; + + if (Kind.isBSS() || Kind.isDataNoRel() || Kind.isCommon()) { + Type *Ty = GV->getType()->getElementType(); + return IsInSmallSection(TM.getTargetData()->getTypeAllocSize(Ty)); + } + + return false; +} + +const MCSection *HexagonTargetObjectFile:: +SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler *Mang, const TargetMachine &TM) const { + + // Handle Small Section classification here. + if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind)) + return SmallBSSSection; + if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind)) + return SmallDataSection; + + // Otherwise, we work the same as ELF. + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM); +} diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h new file mode 100644 index 0000000..101c1f2 --- /dev/null +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h @@ -0,0 +1,40 @@ +//===-- HexagonTargetAsmInfo.h - Hexagon asm properties ---------*- C++ -*--==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
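With the default -hexagon-small-data-threshold of 8, only globals of one to eight bytes are eligible for .sdata/.sbss. A stand-alone C++ mirror of just the size predicate above (the SectionKind and linkage checks are omitted):

  #include <cstdint>
  #include <cstdio>

  static bool IsInSmallSection(uint64_t Size, uint64_t Threshold = 8) {
    return Size > 0 && Size <= Threshold;
  }

  int main() {
    printf("4-byte int      -> %s\n", IsInSmallSection(4)  ? ".sdata/.sbss" : "default section");
    printf("40-byte int[10] -> %s\n", IsInSmallSection(40) ? ".sdata/.sbss" : "default section");
    return 0;
  }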
+// +//===----------------------------------------------------------------------===// + +#ifndef HexagonTARGETOBJECTFILE_H +#define HexagonTARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/MC/MCSectionELF.h" + +namespace llvm { + + class HexagonTargetObjectFile : public TargetLoweringObjectFileELF { + const MCSectionELF *SmallDataSection; + const MCSectionELF *SmallBSSSection; + public: + virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); + + /// IsGlobalInSmallSection - Return true if this global address should be + /// placed into small data/bss section. + bool IsGlobalInSmallSection(const GlobalValue *GV, + const TargetMachine &TM, + SectionKind Kind) const; + bool IsGlobalInSmallSection(const GlobalValue *GV, + const TargetMachine &TM) const; + + const MCSection* SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, + Mangler *Mang, + const TargetMachine &TM) const; + }; + +} // namespace llvm + +#endif diff --git a/lib/Target/Hexagon/HexagonVarargsCallingConvention.h b/lib/Target/Hexagon/HexagonVarargsCallingConvention.h new file mode 100644 index 0000000..21b2d67 --- /dev/null +++ b/lib/Target/Hexagon/HexagonVarargsCallingConvention.h @@ -0,0 +1,141 @@ +//==-- HexagonVarargsCallingConvention.h - Calling Conventions ---*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the functions that assign locations to outgoing function +// arguments. Adapted from the target independent version but this handles +// calls to varargs functions +// +//===----------------------------------------------------------------------===// +// + + + + +static bool RetCC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT, + EVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + Hexagon_CCState &State, + int NonVarArgsParams, + int CurrentParam, + bool ForceMem); + + +static bool CC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT, + EVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + Hexagon_CCState &State, + int NonVarArgsParams, + int CurrentParam, + bool ForceMem) { + unsigned ByValSize = 0; + if (ArgFlags.isByVal() && + ((ByValSize = ArgFlags.getByValSize()) > + (MVT(MVT::i64).getSizeInBits() / 8))) { + ForceMem = true; + } + + + // Only assign registers for named (non varargs) arguments + if ( !ForceMem && ((NonVarArgsParams == -1) || (CurrentParam <= + NonVarArgsParams))) { + + if (LocVT == MVT::i32 || + LocVT == MVT::i16 || + LocVT == MVT::i8 || + LocVT == MVT::f32) { + static const unsigned RegList1[] = { + Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, + Hexagon::R5 + }; + if (unsigned Reg = State.AllocateReg(RegList1, 6)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg, + LocVT.getSimpleVT(), LocInfo)); + return false; + } + } + + if (LocVT == MVT::i64 || + LocVT == MVT::f64) { + static const unsigned RegList2[] = { + Hexagon::D0, Hexagon::D1, Hexagon::D2 + }; + if (unsigned Reg = State.AllocateReg(RegList2, 3)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg, + LocVT.getSimpleVT(), LocInfo)); + return false; + } + } + } + + const Type* ArgTy = LocVT.getTypeForEVT(State.getContext()); + unsigned Alignment = + State.getTarget().getTargetData()->getABITypeAlignment(ArgTy); + unsigned Size = + 
State.getTarget().getTargetData()->getTypeSizeInBits(ArgTy) / 8; + + // If it's passed by value, then we need the size of the aggregate not of + // the pointer. + if (ArgFlags.isByVal()) { + Size = ByValSize; + + // Hexagon_TODO: Get the alignment of the contained type here. + Alignment = 8; + } + + unsigned Offset3 = State.AllocateStack(Size, Alignment); + State.addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset3, + LocVT.getSimpleVT(), LocInfo)); + return false; +} + + +static bool RetCC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT, + EVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + Hexagon_CCState &State, + int NonVarArgsParams, + int CurrentParam, + bool ForceMem) { + + if (LocVT == MVT::i32 || + LocVT == MVT::f32) { + static const unsigned RegList1[] = { + Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, + Hexagon::R5 + }; + if (unsigned Reg = State.AllocateReg(RegList1, 6)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg, + LocVT.getSimpleVT(), LocInfo)); + return false; + } + } + + if (LocVT == MVT::i64 || + LocVT == MVT::f64) { + static const unsigned RegList2[] = { + Hexagon::D0, Hexagon::D1, Hexagon::D2 + }; + if (unsigned Reg = State.AllocateReg(RegList2, 3)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg, + LocVT.getSimpleVT(), LocInfo)); + return false; + } + } + + const Type* ArgTy = LocVT.getTypeForEVT(State.getContext()); + unsigned Alignment = + State.getTarget().getTargetData()->getABITypeAlignment(ArgTy); + unsigned Size = + State.getTarget().getTargetData()->getTypeSizeInBits(ArgTy) / 8; + + unsigned Offset3 = State.AllocateStack(Size, Alignment); + State.addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset3, + LocVT.getSimpleVT(), LocInfo)); + return false; +} diff --git a/lib/Target/Hexagon/LLVMBuild.txt b/lib/Target/Hexagon/LLVMBuild.txt new file mode 100644 index 0000000..84ea6a0 --- /dev/null +++ b/lib/Target/Hexagon/LLVMBuild.txt @@ -0,0 +1,32 @@ +;===- ./lib/Target/Hexagon/LLVMBuild.txt -----------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = TargetInfo MCTargetDesc + +[component_0] +type = TargetGroup +name = Hexagon +parent = Target +has_asmprinter = 1 + +[component_1] +type = Library +name = HexagonCodeGen +parent = Hexagon +required_libraries = AsmPrinter CodeGen Core HexagonInfo SelectionDAG Support Target MC HexagonDesc +add_to_library_groups = Hexagon diff --git a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt new file mode 100644 index 0000000..8e3da99 --- /dev/null +++ b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,6 @@ +add_llvm_library(LLVMHexagonDesc + HexagonMCTargetDesc.cpp + HexagonMCAsmInfo.cpp + ) + +add_dependencies(LLVMHexagonDesc HexagonCommonTableGen) diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp new file mode 100644 index 0000000..188693c --- /dev/null +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp @@ -0,0 +1,36 @@ +//===-- HexagonMCAsmInfo.cpp - Hexagon asm properties -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the HexagonMCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "HexagonMCAsmInfo.h" + +using namespace llvm; + +HexagonMCAsmInfo::HexagonMCAsmInfo(const Target &T, StringRef TT) { + Data16bitsDirective = "\t.half\t"; + Data32bitsDirective = "\t.word\t"; + Data64bitsDirective = 0; // .xword is only supported by V9. + ZeroDirective = "\t.skip\t"; + CommentString = "//"; + HasLEB128 = true; + + PrivateGlobalPrefix = ".L"; + LCOMMDirectiveType = LCOMM::ByteAlignment; + InlineAsmStart = "# InlineAsm Start"; + InlineAsmEnd = "# InlineAsm End"; + ZeroDirective = "\t.space\t"; + AscizDirective = "\t.string\t"; + WeakRefDirective = "\t.weak\t"; + + UsesELFSectionDirectiveForBSS = true; + ExceptionsType = ExceptionHandling::DwarfCFI; +} diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h new file mode 100644 index 0000000..8196e95 --- /dev/null +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h @@ -0,0 +1,30 @@ +//===-- HexagonTargetAsmInfo.h - Hexagon asm properties ---------*- C++ -*--==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the HexagonMCAsmInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef HexagonMCASMINFO_H +#define HexagonMCASMINFO_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmInfo.h" + +namespace llvm { + class Target; + + class HexagonMCAsmInfo : public MCAsmInfo { + public: + explicit HexagonMCAsmInfo(const Target &T, StringRef TT); + }; + +} // namespace llvm + +#endif diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp new file mode 100644 index 0000000..625f07c --- /dev/null +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -0,0 +1,94 @@ +//===-- HexagonMCTargetDesc.cpp - Hexagon Target Descriptions ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides Hexagon specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "HexagonMCTargetDesc.h" +#include "HexagonMCAsmInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/TargetRegistry.h" + +#define GET_INSTRINFO_MC_DESC +#include "HexagonGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "HexagonGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "HexagonGenRegisterInfo.inc" + +using namespace llvm; + +static MCInstrInfo *createHexagonMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitHexagonMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitHexagonMCRegisterInfo(X, Hexagon::R0); + return X; +} + +static MCSubtargetInfo *createHexagonMCSubtargetInfo(StringRef TT, + StringRef CPU, + StringRef FS) { + MCSubtargetInfo *X = new MCSubtargetInfo(); + InitHexagonMCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCAsmInfo *createHexagonMCAsmInfo(const Target &T, StringRef TT) { + MCAsmInfo *MAI = new HexagonMCAsmInfo(T, TT); + + // VirtualFP = (R30 + #0). + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(Hexagon::R30, 0); + MAI->addInitialFrameState(0, Dst, Src); + + return MAI; +} + +static MCCodeGenInfo *createHexagonMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + // For the time being, use static relocations, since there's really no + // support for PIC yet. + X->InitMCCodeGenInfo(Reloc::Static, CM, OL); + return X; +} + +// Force static initialization. +extern "C" void LLVMInitializeHexagonTargetMC() { + // Register the MC asm info. + RegisterMCAsmInfoFn X(TheHexagonTarget, createHexagonMCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(TheHexagonTarget, + createHexagonMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(TheHexagonTarget, createHexagonMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(TheHexagonTarget, + createHexagonMCRegisterInfo); + + // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheHexagonTarget, + createHexagonMCSubtargetInfo); +} diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h new file mode 100644 index 0000000..364841f --- /dev/null +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -0,0 +1,40 @@ +//===-- HexagonMCTargetDesc.h - Hexagon Target Descriptions -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides Hexagon specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONMCTARGETDESC_H +#define HEXAGONMCTARGETDESC_H + +namespace llvm { +class MCSubtargetInfo; +class Target; +class StringRef; + +extern Target TheHexagonTarget; + +} // End llvm namespace + +// Define symbolic names for Hexagon registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "HexagonGenRegisterInfo.inc" + +// Defines symbolic names for the Hexagon instructions. +// +#define GET_INSTRINFO_ENUM +#include "HexagonGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "HexagonGenSubtargetInfo.inc" + +#endif diff --git a/lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt b/lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 0000000..1114d99 --- /dev/null +++ b/lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt ----------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = HexagonDesc +parent = Hexagon +required_libraries = HexagonInfo MC +add_to_library_groups = Hexagon diff --git a/lib/Target/Hexagon/MCTargetDesc/Makefile b/lib/Target/Hexagon/MCTargetDesc/Makefile new file mode 100644 index 0000000..67be2bc --- /dev/null +++ b/lib/Target/Hexagon/MCTargetDesc/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/Hexagon/MCTargetDesc/Makefile ------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMHexagonDesc + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/Hexagon/Makefile b/lib/Target/Hexagon/Makefile new file mode 100644 index 0000000..c936e92 --- /dev/null +++ b/lib/Target/Hexagon/Makefile @@ -0,0 +1,23 @@ +##===- lib/Target/Hexagon/Makefile -------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License.
See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../.. +LIBRARYNAME = LLVMHexagonCodeGen +TARGET = Hexagon + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = HexagonGenRegisterInfo.inc \ + HexagonGenInstrInfo.inc \ + HexagonGenAsmWriter.inc \ + HexagonGenDAGISel.inc HexagonGenSubtargetInfo.inc \ + HexagonGenCallingConv.inc \ + HexagonAsmPrinter.cpp + +DIRS = TargetInfo MCTargetDesc + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/Hexagon/TargetInfo/CMakeLists.txt b/lib/Target/Hexagon/TargetInfo/CMakeLists.txt new file mode 100644 index 0000000..5b04a30 --- /dev/null +++ b/lib/Target/Hexagon/TargetInfo/CMakeLists.txt @@ -0,0 +1,8 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. + ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMHexagonInfo + HexagonTargetInfo.cpp + ) + +add_dependencies(LLVMHexagonInfo HexagonCommonTableGen) diff --git a/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp new file mode 100644 index 0000000..7aa5dd3 --- /dev/null +++ b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp @@ -0,0 +1,19 @@ +//===-- HexagonTargetInfo.cpp - Hexagon Target Implementation ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Hexagon.h" +#include "llvm/Module.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +Target llvm::TheHexagonTarget; + +extern "C" void LLVMInitializeHexagonTargetInfo() { + RegisterTarget<Triple::hexagon, /*HasJIT=*/false> X(TheHexagonTarget, "hexagon", "Hexagon"); +} diff --git a/lib/Target/Hexagon/TargetInfo/LLVMBuild.txt b/lib/Target/Hexagon/TargetInfo/LLVMBuild.txt new file mode 100644 index 0000000..7b87be3 --- /dev/null +++ b/lib/Target/Hexagon/TargetInfo/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/Hexagon/TargetInfo/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = HexagonInfo +parent = Hexagon +required_libraries = MC Support +add_to_library_groups = Hexagon diff --git a/lib/Target/Hexagon/TargetInfo/Makefile b/lib/Target/Hexagon/TargetInfo/Makefile new file mode 100644 index 0000000..494cca1 --- /dev/null +++ b/lib/Target/Hexagon/TargetInfo/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/Hexagon/TargetInfo/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMHexagonInfo + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 358cbc8..5a42ca5 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = ARM CBackend CellSPU CppBackend Hexagon MBlaze MSP430 Mips PTX PowerPC Sparc X86 XCore + ; This is a special group whose required libraries are extended (by llvm-build) ; with the best execution engine (the native JIT, if available, or the ; interpreter). diff --git a/lib/Target/MBlaze/AsmParser/CMakeLists.txt b/lib/Target/MBlaze/AsmParser/CMakeLists.txt index ec8f52a..813767b 100644 --- a/lib/Target/MBlaze/AsmParser/CMakeLists.txt +++ b/lib/Target/MBlaze/AsmParser/CMakeLists.txt @@ -6,11 +6,4 @@ add_llvm_library(LLVMMBlazeAsmParser MBlazeAsmParser.cpp ) -add_llvm_library_dependencies(LLVMMBlazeAsmParser - LLVMMBlazeInfo - LLVMMC - LLVMMCParser - LLVMSupport - ) - add_dependencies(LLVMMBlazeAsmParser MBlazeCommonTableGen) diff --git a/lib/Target/MBlaze/AsmParser/LLVMBuild.txt b/lib/Target/MBlaze/AsmParser/LLVMBuild.txt index 2c61a7f..b10189a 100644 --- a/lib/Target/MBlaze/AsmParser/LLVMBuild.txt +++ b/lib/Target/MBlaze/AsmParser/LLVMBuild.txt @@ -21,4 +21,3 @@ name = MBlazeAsmParser parent = MBlaze required_libraries = MBlazeInfo MC MCParser Support add_to_library_groups = MBlaze - diff --git a/lib/Target/MBlaze/CMakeLists.txt b/lib/Target/MBlaze/CMakeLists.txt index d3f1383..71095e5 100644 --- a/lib/Target/MBlaze/CMakeLists.txt +++ b/lib/Target/MBlaze/CMakeLists.txt @@ -29,19 +29,6 @@ add_llvm_target(MBlazeCodeGen MBlazeELFWriterInfo.cpp ) -add_llvm_library_dependencies(LLVMMBlazeCodeGen - LLVMAsmPrinter - LLVMCodeGen - LLVMCore - LLVMMBlazeAsmPrinter - LLVMMBlazeDesc - LLVMMBlazeInfo - LLVMMC - LLVMSelectionDAG - LLVMSupport - LLVMTarget - ) - add_subdirectory(AsmParser) add_subdirectory(Disassembler) add_subdirectory(InstPrinter) diff --git a/lib/Target/MBlaze/Disassembler/CMakeLists.txt b/lib/Target/MBlaze/Disassembler/CMakeLists.txt index e0a53ee..be2dce1 100644 --- a/lib/Target/MBlaze/Disassembler/CMakeLists.txt +++ b/lib/Target/MBlaze/Disassembler/CMakeLists.txt @@ -13,11 +13,4 @@ set_property( ) endif() -add_llvm_library_dependencies(LLVMMBlazeDisassembler - LLVMMBlazeDesc - LLVMMBlazeInfo - LLVMMC - LLVMSupport - ) - add_dependencies(LLVMMBlazeDisassembler MBlazeCommonTableGen) diff --git a/lib/Target/MBlaze/Disassembler/LLVMBuild.txt b/lib/Target/MBlaze/Disassembler/LLVMBuild.txt index c5c4f80..28dd9dc 100644 --- a/lib/Target/MBlaze/Disassembler/LLVMBuild.txt +++ b/lib/Target/MBlaze/Disassembler/LLVMBuild.txt @@ -21,4 +21,3 @@ name = MBlazeDisassembler parent = MBlaze required_libraries = MBlazeDesc MBlazeInfo MC Support add_to_library_groups = MBlaze - diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp index 3087317..ccc3a05 100644 --- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp +++ b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp @@ -123,6 +123,7 @@ static unsigned decodeSEXT(uint32_t insn) { case 0x41: return MBlaze::SRL; case 0x21: return MBlaze::SRC; case 0x01: return MBlaze::SRA; + case 0xE0: return MBlaze::CLZ; } } @@ -176,6 +177,13 @@ static unsigned decodeBR(uint32_t insn) { } static unsigned decodeBRI(uint32_t insn) { + switch (insn&0x3FFFFFF) { + default: break; + case 0x0020004: return MBlaze::IDMEMBAR; + case 0x0220004: return MBlaze::DMEMBAR; + case 0x0420004: 
return MBlaze::IMEMBAR; + } + switch ((insn>>16)&0x1F) { default: return UNSUPPORTED; case 0x00: return MBlaze::BRI; @@ -531,6 +539,9 @@ MCDisassembler::DecodeStatus MBlazeDisassembler::getInstruction(MCInst &instr, default: return Fail; + case MBlazeII::FC: + break; + case MBlazeII::FRRRR: if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED) return Fail; @@ -547,6 +558,13 @@ MCDisassembler::DecodeStatus MBlazeDisassembler::getInstruction(MCInst &instr, instr.addOperand(MCOperand::CreateReg(RB)); break; + case MBlazeII::FRR: + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return Fail; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); + break; + case MBlazeII::FRI: switch (opcode) { default: diff --git a/lib/Target/MBlaze/InstPrinter/CMakeLists.txt b/lib/Target/MBlaze/InstPrinter/CMakeLists.txt index aff0b3d..586e2d3 100644 --- a/lib/Target/MBlaze/InstPrinter/CMakeLists.txt +++ b/lib/Target/MBlaze/InstPrinter/CMakeLists.txt @@ -5,9 +5,4 @@ add_llvm_library(LLVMMBlazeAsmPrinter MBlazeInstPrinter.cpp ) -add_llvm_library_dependencies(LLVMMBlazeAsmPrinter - LLVMMC - LLVMSupport - ) - add_dependencies(LLVMMBlazeAsmPrinter MBlazeCommonTableGen) diff --git a/lib/Target/MBlaze/InstPrinter/LLVMBuild.txt b/lib/Target/MBlaze/InstPrinter/LLVMBuild.txt index 7a21f1e..3a21a05 100644 --- a/lib/Target/MBlaze/InstPrinter/LLVMBuild.txt +++ b/lib/Target/MBlaze/InstPrinter/LLVMBuild.txt @@ -21,4 +21,3 @@ name = MBlazeAsmPrinter parent = MBlaze required_libraries = MC Support add_to_library_groups = MBlaze - diff --git a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h index 570ab08..5297563 100644 --- a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h +++ b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h @@ -1,4 +1,4 @@ -//===-- MBLazeInstPrinter.h - Convert MBlaze MCInst to assembly syntax ----===// +//===-- MBlazeInstPrinter.h - Convert MBlaze MCInst to assembly syntax ----===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/MBlaze/LLVMBuild.txt b/lib/Target/MBlaze/LLVMBuild.txt index f1a3f5d..0b29007 100644 --- a/lib/Target/MBlaze/LLVMBuild.txt +++ b/lib/Target/MBlaze/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo + [component_0] type = TargetGroup name = MBlaze @@ -29,4 +32,3 @@ name = MBlazeCodeGen parent = MBlaze required_libraries = AsmPrinter CodeGen Core MBlazeAsmPrinter MBlazeDesc MBlazeInfo MC SelectionDAG Support Target add_to_library_groups = MBlaze - diff --git a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp index ff051e3..c751dd8 100644 --- a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp +++ b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp @@ -310,9 +310,9 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { // Check if the last terminator is an unconditional branch. MachineBasicBlock::const_iterator I = Pred->end(); - while (I != Pred->begin() && !(--I)->getDesc().isTerminator()) + while (I != Pred->begin() && !(--I)->isTerminator()) ; // Noop - return I == Pred->end() || !I->getDesc().isBarrier(); + return I == Pred->end() || !I->isBarrier(); } // Force static initialization. 
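The MBlazeAsmPrinter hunk above is one instance of a change that recurs across the MBlaze, MSP430 and Mips diffs in this patch: predicates such as isTerminator, isBarrier, isBranch, mayLoad, mayStore and hasDelaySlot are now queried on the MachineInstr itself instead of going through MI->getDesc(). For an ordinary, unbundled instruction the two spellings give the same answer, since the MachineInstr accessors forward to the MCInstrDesc flags; the new form additionally works for instruction bundles. A minimal sketch of the equivalence, using a hypothetical helper name:

#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Hypothetical helper: can control fall through past MI?
static bool fallsThrough(const MachineInstr *MI) {
  bool OldForm = !MI->getDesc().isBarrier(); // pre-patch spelling
  bool NewForm = !MI->isBarrier();           // post-patch spelling
  assert(OldForm == NewForm && "identical for unbundled instructions");
  return NewForm;
}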
diff --git a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp index c07570a..19e787d 100644 --- a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp +++ b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp @@ -29,13 +29,11 @@ using namespace llvm; STATISTIC(FilledSlots, "Number of delay slots filled"); -namespace llvm { -cl::opt<bool> DisableDelaySlotFiller( +static cl::opt<bool> MBDisableDelaySlotFiller( "disable-mblaze-delay-filler", cl::init(false), cl::desc("Disable the MBlaze delay slot filter."), cl::Hidden); -} namespace { struct Filler : public MachineFunctionPass { @@ -109,7 +107,6 @@ static bool delayHasHazard(MachineBasicBlock::iterator &candidate, // Hazard check MachineBasicBlock::iterator a = candidate; MachineBasicBlock::iterator b = slot; - MCInstrDesc desc = candidate->getDesc(); // MBB layout:- // candidate := a0 = operation(a1, a2) @@ -123,7 +120,7 @@ static bool delayHasHazard(MachineBasicBlock::iterator &candidate, // 4. b0 is one or more of {a1, a2} // 5. a accesses memory, and the middle bit // contains a store operation. - bool a_is_memory = desc.mayLoad() || desc.mayStore(); + bool a_is_memory = candidate->mayLoad() || candidate->mayStore(); // Determine the number of operands in the slot instruction and in the // candidate instruction. @@ -156,7 +153,7 @@ static bool delayHasHazard(MachineBasicBlock::iterator &candidate, } // Check hazard type 5 - if (a_is_memory && m->getDesc().mayStore()) + if (a_is_memory && m->mayStore()) return true; } @@ -183,8 +180,8 @@ static bool isDelayFiller(MachineBasicBlock &MBB, if (candidate == MBB.begin()) return false; - MCInstrDesc brdesc = (--candidate)->getDesc(); - return (brdesc.hasDelaySlot()); + --candidate; + return (candidate->hasDelaySlot()); } static bool hasUnknownSideEffects(MachineBasicBlock::iterator &I) { @@ -211,9 +208,8 @@ findDelayInstr(MachineBasicBlock &MBB,MachineBasicBlock::iterator slot) { break; --I; - MCInstrDesc desc = I->getDesc(); - if (desc.hasDelaySlot() || desc.isBranch() || isDelayFiller(MBB,I) || - desc.isCall() || desc.isReturn() || desc.isBarrier() || + if (I->hasDelaySlot() || I->isBranch() || isDelayFiller(MBB,I) || + I->isCall() || I->isReturn() || I->isBarrier() || hasUnknownSideEffects(I)) break; @@ -232,11 +228,11 @@ findDelayInstr(MachineBasicBlock &MBB,MachineBasicBlock::iterator slot) { bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { bool Changed = false; for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) - if (I->getDesc().hasDelaySlot()) { + if (I->hasDelaySlot()) { MachineBasicBlock::iterator D = MBB.end(); MachineBasicBlock::iterator J = I; - if (!DisableDelaySlotFiller) + if (!MBDisableDelaySlotFiller) D = findDelayInstr(MBB,I); ++FilledSlots; diff --git a/lib/Target/MBlaze/MBlazeFrameLowering.cpp b/lib/Target/MBlaze/MBlazeFrameLowering.cpp index f28d5a7..37919bc 100644 --- a/lib/Target/MBlaze/MBlazeFrameLowering.cpp +++ b/lib/Target/MBlaze/MBlazeFrameLowering.cpp @@ -32,13 +32,11 @@ using namespace llvm; -namespace llvm { - cl::opt<bool> DisableStackAdjust( - "disable-mblaze-stack-adjust", - cl::init(false), - cl::desc("Disable MBlaze stack layout adjustment."), - cl::Hidden); -} +static cl::opt<bool> MBDisableStackAdjust( + "disable-mblaze-stack-adjust", + cl::init(false), + cl::desc("Disable MBlaze stack layout adjustment."), + cl::Hidden); static void replaceFrameIndexes(MachineFunction &MF, SmallVector<std::pair<int,int64_t>, 16> &FR) { @@ -85,7 +83,7 @@ static void replaceFrameIndexes(MachineFunction &MF, 
//===----------------------------------------------------------------------===// static void analyzeFrameIndexes(MachineFunction &MF) { - if (DisableStackAdjust) return; + if (MBDisableStackAdjust) return; MachineFrameInfo *MFI = MF.getFrameInfo(); MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>(); @@ -336,7 +334,8 @@ int MBlazeFrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) // if frame pointer elimination is disabled. bool MBlazeFrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - return DisableFramePointerElim(MF) || MFI->hasVarSizedObjects(); + return MF.getTarget().Options.DisableFramePointerElim(MF) || + MFI->hasVarSizedObjects(); } void MBlazeFrameLowering::emitPrologue(MachineFunction &MF) const { diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp index 148d906..0002174 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.cpp +++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp @@ -167,7 +167,9 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM) setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); setOperationAction(ISD::CTLZ, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); setOperationAction(ISD::CTTZ, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); setOperationAction(ISD::CTPOP, MVT::i32, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Expand); diff --git a/lib/Target/MBlaze/MBlazeInstrFormats.td b/lib/Target/MBlaze/MBlazeInstrFormats.td index 54f605f..4c6034d 100644 --- a/lib/Target/MBlaze/MBlazeInstrFormats.td +++ b/lib/Target/MBlaze/MBlazeInstrFormats.td @@ -35,6 +35,7 @@ def FRIR : Format<17>; // RSUBI def FRRRR : Format<18>; // RSUB, FRSUB def FRI : Format<19>; // RSUB, FRSUB def FC : Format<20>; // NOP +def FRR : Format<21>; // CLZ //===----------------------------------------------------------------------===// // Describe MBlaze instructions format @@ -202,3 +203,26 @@ class MSR<bits<6> op, bits<6> flags, dag outs, dag ins, string asmstr, let Inst{11-16} = flags; let Inst{17-31} = imm15; } + +//===----------------------------------------------------------------------===// +// TCLZ instruction class in MBlaze : <|opcode|rd|imm15|> +//===----------------------------------------------------------------------===// +class TCLZ<bits<6> op, bits<16> flags, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin> : + MBlazeInst<op, FRR, outs, ins, asmstr, pattern, itin> { + bits<5> rd; + bits<5> ra; + + let Inst{6-10} = rd; + let Inst{11-15} = ra; + let Inst{16-31} = flags; +} + +//===----------------------------------------------------------------------===// +// MBAR instruction class in MBlaze : <|opcode|rd|imm15|> +//===----------------------------------------------------------------------===// +class MBAR<bits<6> op, bits<26> flags, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin> : + MBlazeInst<op, FC, outs, ins, asmstr, pattern, itin> { + let Inst{6-31} = flags; +} diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td index 1d8c987..9fe2a49 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.td +++ b/lib/Target/MBlaze/MBlazeInstrInfo.td @@ -594,9 +594,18 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, //===----------------------------------------------------------------------===// let neverHasSideEffects = 1 in { - def 
NOP : MBlazeInst< 0x20, FC, (outs), (ins), "nop ", [], IIC_ALU>; + def NOP : MBlazeInst<0x20, FC, (outs), (ins), "nop ", [], IIC_ALU>; } +let Predicates=[HasPatCmp] in { + def CLZ : TCLZ<0x24, 0x00E0, (outs GPR:$dst), (ins GPR:$src), + "clz $dst, $src", [], IIC_ALU>; +} + +def IMEMBAR : MBAR<0x2E, 0x0420004, (outs), (ins), "mbar 2", [], IIC_ALU>; +def DMEMBAR : MBAR<0x2E, 0x0220004, (outs), (ins), "mbar 1", [], IIC_ALU>; +def IDMEMBAR : MBAR<0x2E, 0x0020004, (outs), (ins), "mbar 0", [], IIC_ALU>; + let usesCustomInserter = 1 in { def Select_CC : MBlazePseudo<(outs GPR:$dst), (ins GPR:$T, GPR:$F, GPR:$CMP, i32imm:$CC), // F T reversed @@ -751,6 +760,56 @@ def : Pat<(sra GPR:$L, GPR:$R), (ShiftRA GPR:$L, GPR:$R)>; def : Pat<(srl GPR:$L, GPR:$R), (ShiftRL GPR:$L, GPR:$R)>; // SET_CC operations +def : Pat<(setcc (i32 GPR:$L), (i32 0), SETEQ), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 1)>; +def : Pat<(setcc (i32 GPR:$L), (i32 0), SETNE), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 2)>; +def : Pat<(setcc (i32 GPR:$L), (i32 0), SETGT), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 3)>; +def : Pat<(setcc (i32 GPR:$L), (i32 0), SETLT), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 4)>; +def : Pat<(setcc (i32 GPR:$L), (i32 0), SETGE), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 5)>; +def : Pat<(setcc (i32 GPR:$L), (i32 0), SETLE), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 6)>; +def : Pat<(setcc (i32 GPR:$L), (i32 0), SETUGT), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), + (CMPU (i32 R0), GPR:$L), 3)>; +def : Pat<(setcc (i32 GPR:$L), (i32 0), SETULT), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), + (CMPU (i32 R0), GPR:$L), 4)>; +def : Pat<(setcc (i32 GPR:$L), (i32 0), SETUGE), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), + (CMPU (i32 R0), GPR:$L), 5)>; +def : Pat<(setcc (i32 GPR:$L), (i32 0), SETULE), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), + (CMPU (i32 R0), GPR:$L), 6)>; + +def : Pat<(setcc (i32 0), (i32 GPR:$R), SETEQ), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 1)>; +def : Pat<(setcc (i32 0), (i32 GPR:$R), SETNE), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 2)>; +def : Pat<(setcc (i32 0), (i32 GPR:$R), SETGT), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 3)>; +def : Pat<(setcc (i32 0), (i32 GPR:$R), SETLT), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 4)>; +def : Pat<(setcc (i32 0), (i32 GPR:$R), SETGE), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 5)>; +def : Pat<(setcc (i32 0), (i32 GPR:$R), SETLE), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 6)>; +def : Pat<(setcc (i32 0), (i32 GPR:$R), SETUGT), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), + (CMPU GPR:$R, (i32 R0)), 3)>; +def : Pat<(setcc (i32 0), (i32 GPR:$R), SETULT), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), + (CMPU GPR:$R, (i32 R0)), 4)>; +def : Pat<(setcc (i32 0), (i32 GPR:$R), SETUGE), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), + (CMPU GPR:$R, (i32 R0)), 5)>; +def : Pat<(setcc (i32 0), (i32 GPR:$R), SETULE), + (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), + (CMPU GPR:$R, (i32 R0)), 6)>; + def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETEQ), (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), (CMP GPR:$R, GPR:$L), 1)>; @@ -787,6 +846,68 @@ def : Pat<(select (i32 GPR:$C), (i32 GPR:$T), (i32 GPR:$F)), (Select_CC GPR:$T, GPR:$F, 
GPR:$C, 2)>; // SELECT_CC +def : Pat<(selectcc (i32 GPR:$L), (i32 0), + (i32 GPR:$T), (i32 GPR:$F), SETEQ), + (Select_CC GPR:$T, GPR:$F, GPR:$L, 1)>; +def : Pat<(selectcc (i32 GPR:$L), (i32 0), + (i32 GPR:$T), (i32 GPR:$F), SETNE), + (Select_CC GPR:$T, GPR:$F, GPR:$L, 2)>; +def : Pat<(selectcc (i32 GPR:$L), (i32 0), + (i32 GPR:$T), (i32 GPR:$F), SETGT), + (Select_CC GPR:$T, GPR:$F, GPR:$L, 3)>; +def : Pat<(selectcc (i32 GPR:$L), (i32 0), + (i32 GPR:$T), (i32 GPR:$F), SETLT), + (Select_CC GPR:$T, GPR:$F, GPR:$L, 4)>; +def : Pat<(selectcc (i32 GPR:$L), (i32 0), + (i32 GPR:$T), (i32 GPR:$F), SETGE), + (Select_CC GPR:$T, GPR:$F, GPR:$L, 5)>; +def : Pat<(selectcc (i32 GPR:$L), (i32 0), + (i32 GPR:$T), (i32 GPR:$F), SETLE), + (Select_CC GPR:$T, GPR:$F, GPR:$L, 6)>; +def : Pat<(selectcc (i32 GPR:$L), (i32 0), + (i32 GPR:$T), (i32 GPR:$F), SETUGT), + (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 3)>; +def : Pat<(selectcc (i32 GPR:$L), (i32 0), + (i32 GPR:$T), (i32 GPR:$F), SETULT), + (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 4)>; +def : Pat<(selectcc (i32 GPR:$L), (i32 0), + (i32 GPR:$T), (i32 GPR:$F), SETUGE), + (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 5)>; +def : Pat<(selectcc (i32 GPR:$L), (i32 0), + (i32 GPR:$T), (i32 GPR:$F), SETULE), + (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 6)>; + +def : Pat<(selectcc (i32 0), (i32 GPR:$R), + (i32 GPR:$T), (i32 GPR:$F), SETEQ), + (Select_CC GPR:$T, GPR:$F, GPR:$R, 1)>; +def : Pat<(selectcc (i32 0), (i32 GPR:$R), + (i32 GPR:$T), (i32 GPR:$F), SETNE), + (Select_CC GPR:$T, GPR:$F, GPR:$R, 2)>; +def : Pat<(selectcc (i32 0), (i32 GPR:$R), + (i32 GPR:$T), (i32 GPR:$F), SETGT), + (Select_CC GPR:$T, GPR:$F, GPR:$R, 3)>; +def : Pat<(selectcc (i32 0), (i32 GPR:$R), + (i32 GPR:$T), (i32 GPR:$F), SETLT), + (Select_CC GPR:$T, GPR:$F, GPR:$R, 4)>; +def : Pat<(selectcc (i32 0), (i32 GPR:$R), + (i32 GPR:$T), (i32 GPR:$F), SETGE), + (Select_CC GPR:$T, GPR:$F, GPR:$R, 5)>; +def : Pat<(selectcc (i32 0), (i32 GPR:$R), + (i32 GPR:$T), (i32 GPR:$F), SETLE), + (Select_CC GPR:$T, GPR:$F, GPR:$R, 6)>; +def : Pat<(selectcc (i32 0), (i32 GPR:$R), + (i32 GPR:$T), (i32 GPR:$F), SETUGT), + (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 3)>; +def : Pat<(selectcc (i32 0), (i32 GPR:$R), + (i32 GPR:$T), (i32 GPR:$F), SETULT), + (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 4)>; +def : Pat<(selectcc (i32 0), (i32 GPR:$R), + (i32 GPR:$T), (i32 GPR:$F), SETUGE), + (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 5)>; +def : Pat<(selectcc (i32 0), (i32 GPR:$R), + (i32 GPR:$T), (i32 GPR:$F), SETULE), + (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 6)>; + def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R), (i32 GPR:$T), (i32 GPR:$F), SETEQ), (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 1)>; @@ -827,6 +948,48 @@ def : Pat<(br bb:$T), (BRID bb:$T)>; def : Pat<(brind GPR:$T), (BRAD GPR:$T)>; // BRCOND instructions +def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETEQ), bb:$T), + (BEQID GPR:$L, bb:$T)>; +def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETNE), bb:$T), + (BNEID GPR:$L, bb:$T)>; +def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETGT), bb:$T), + (BGTID GPR:$L, bb:$T)>; +def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETLT), bb:$T), + (BLTID GPR:$L, bb:$T)>; +def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETGE), bb:$T), + (BGEID GPR:$L, bb:$T)>; +def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETLE), bb:$T), + (BLEID GPR:$L, bb:$T)>; +def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETUGT), bb:$T), + (BGTID (CMPU (i32 R0), 
GPR:$L), bb:$T)>; +def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETULT), bb:$T), + (BLTID (CMPU (i32 R0), GPR:$L), bb:$T)>; +def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETUGE), bb:$T), + (BGEID (CMPU (i32 R0), GPR:$L), bb:$T)>; +def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETULE), bb:$T), + (BLEID (CMPU (i32 R0), GPR:$L), bb:$T)>; + +def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETEQ), bb:$T), + (BEQID GPR:$R, bb:$T)>; +def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETNE), bb:$T), + (BNEID GPR:$R, bb:$T)>; +def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETGT), bb:$T), + (BGTID GPR:$R, bb:$T)>; +def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETLT), bb:$T), + (BLTID GPR:$R, bb:$T)>; +def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETGE), bb:$T), + (BGEID GPR:$R, bb:$T)>; +def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETLE), bb:$T), + (BLEID GPR:$R, bb:$T)>; +def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETUGT), bb:$T), + (BGTID (CMPU GPR:$R, (i32 R0)), bb:$T)>; +def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETULT), bb:$T), + (BLTID (CMPU GPR:$R, (i32 R0)), bb:$T)>; +def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETUGE), bb:$T), + (BGEID (CMPU GPR:$R, (i32 R0)), bb:$T)>; +def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETULE), bb:$T), + (BLEID (CMPU GPR:$R, (i32 R0)), bb:$T)>; + def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETEQ), bb:$T), (BEQID (CMP GPR:$R, GPR:$L), bb:$T)>; def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETNE), bb:$T), @@ -869,11 +1032,11 @@ def : Pat<(store (i32 GPR:$dst), xaddr:$addr), (SW GPR:$dst, xaddr:$addr)>; def : Pat<(load xaddr:$addr), (i32 (LW xaddr:$addr))>; // 16-bit load and store -def : Pat<(truncstorei16 (i32 GPR:$dst), xaddr:$addr), (SH GPR:$dst, xaddr:$addr)>; +def : Pat<(truncstorei16 (i32 GPR:$dst), xaddr:$ad), (SH GPR:$dst, xaddr:$ad)>; def : Pat<(zextloadi16 xaddr:$addr), (i32 (LHU xaddr:$addr))>; // 8-bit load and store -def : Pat<(truncstorei8 (i32 GPR:$dst), xaddr:$addr), (SB GPR:$dst, xaddr:$addr)>; +def : Pat<(truncstorei8 (i32 GPR:$dst), xaddr:$ad), (SB GPR:$dst, xaddr:$ad)>; def : Pat<(zextloadi8 xaddr:$addr), (i32 (LBU xaddr:$addr))>; // Peepholes diff --git a/lib/Target/MBlaze/MBlazeMCInstLower.cpp b/lib/Target/MBlaze/MBlazeMCInstLower.cpp index a7e400b..7e5598f 100644 --- a/lib/Target/MBlaze/MBlazeMCInstLower.cpp +++ b/lib/Target/MBlaze/MBlazeMCInstLower.cpp @@ -1,4 +1,4 @@ -//===-- MBLazeMCInstLower.cpp - Convert MBlaze MachineInstr to an MCInst---===// +//===-- MBlazeMCInstLower.cpp - Convert MBlaze MachineInstr to an MCInst---===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp index 4ad7bd6..5ed81dd 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp +++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp @@ -33,16 +33,16 @@ extern "C" void LLVMInitializeMBlazeTarget() { // an easier handling. 
MBlazeTargetMachine:: MBlazeTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL): - LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), - Subtarget(TT, CPU, FS), - DataLayout("E-p:32:32:32-i8:8:8-i16:16:16"), - InstrInfo(*this), - FrameLowering(Subtarget), - TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this), - InstrItins(Subtarget.getInstrItineraryData()) { + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS), + DataLayout("E-p:32:32:32-i8:8:8-i16:16:16"), + InstrInfo(*this), + FrameLowering(Subtarget), + TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this), + InstrItins(Subtarget.getInstrItineraryData()) { } // Install an instruction selector pass using diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.h b/lib/Target/MBlaze/MBlazeTargetMachine.h index 1c1aa53..036f1b6 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.h +++ b/lib/Target/MBlaze/MBlazeTargetMachine.h @@ -43,6 +43,7 @@ namespace llvm { public: MBlazeTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); diff --git a/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt b/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt index 37871b6..6fa7f43 100644 --- a/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt @@ -5,11 +5,4 @@ add_llvm_library(LLVMMBlazeDesc MBlazeMCTargetDesc.cpp ) -add_llvm_library_dependencies(LLVMMBlazeDesc - LLVMMBlazeAsmPrinter - LLVMMBlazeInfo - LLVMMC - LLVMSupport - ) - add_dependencies(LLVMMBlazeDesc MBlazeCommonTableGen) diff --git a/lib/Target/MBlaze/MCTargetDesc/LLVMBuild.txt b/lib/Target/MBlaze/MCTargetDesc/LLVMBuild.txt index e89811b..4982f0f 100644 --- a/lib/Target/MBlaze/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/MBlaze/MCTargetDesc/LLVMBuild.txt @@ -21,4 +21,3 @@ name = MBlazeDesc parent = MBlaze required_libraries = MBlazeAsmPrinter MBlazeInfo MC Support add_to_library_groups = MBlaze - diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp index 08f7d46..d5acbe9 100644 --- a/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp +++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp @@ -58,6 +58,11 @@ public: bool MayNeedRelaxation(const MCInst &Inst) const; + bool fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const; + void RelaxInstruction(const MCInst &Inst, MCInst &Res) const; bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const; @@ -87,6 +92,18 @@ bool MBlazeAsmBackend::MayNeedRelaxation(const MCInst &Inst) const { return hasExprOrImm; } +bool MBlazeAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const { + // FIXME: Is this right? It's what the "generic" code was doing before, + // but is X86 specific. Is it actually true for MBlaze also, or was it + // just close enough to not be a big deal? + // + // Relax if the value is too big for a (signed) i8. 
+ return int64_t(Value) != int64_t(int8_t(Value)); +} + void MBlazeAsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const { Res = Inst; Res.setOpcode(getRelaxedOpcode(Inst.getOpcode())); diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h index 776dbc4..c8bdd6f 100644 --- a/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h +++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h @@ -51,6 +51,7 @@ namespace MBlazeII { FRRRR, FRI, FC, + FRR, FormMask = 63 //===------------------------------------------------------------------===// diff --git a/lib/Target/MBlaze/TargetInfo/CMakeLists.txt b/lib/Target/MBlaze/TargetInfo/CMakeLists.txt index 93fce58..b554d9b 100644 --- a/lib/Target/MBlaze/TargetInfo/CMakeLists.txt +++ b/lib/Target/MBlaze/TargetInfo/CMakeLists.txt @@ -5,10 +5,4 @@ add_llvm_library(LLVMMBlazeInfo MBlazeTargetInfo.cpp ) -add_llvm_library_dependencies(LLVMMBlazeInfo - LLVMMC - LLVMSupport - LLVMTarget - ) - add_dependencies(LLVMMBlazeInfo MBlazeCommonTableGen) diff --git a/lib/Target/MBlaze/TargetInfo/LLVMBuild.txt b/lib/Target/MBlaze/TargetInfo/LLVMBuild.txt index 938a1d9..ba7ee5d 100644 --- a/lib/Target/MBlaze/TargetInfo/LLVMBuild.txt +++ b/lib/Target/MBlaze/TargetInfo/LLVMBuild.txt @@ -21,4 +21,3 @@ name = MBlazeInfo parent = MBlaze required_libraries = MC Support Target add_to_library_groups = MBlaze - diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt index 55c2d7d..7daa7a2 100644 --- a/lib/Target/MSP430/CMakeLists.txt +++ b/lib/Target/MSP430/CMakeLists.txt @@ -22,19 +22,6 @@ add_llvm_target(MSP430CodeGen MSP430MCInstLower.cpp ) -add_llvm_library_dependencies(LLVMMSP430CodeGen - LLVMAsmPrinter - LLVMCodeGen - LLVMCore - LLVMMC - LLVMMSP430AsmPrinter - LLVMMSP430Desc - LLVMMSP430Info - LLVMSelectionDAG - LLVMSupport - LLVMTarget - ) - add_subdirectory(InstPrinter) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/MSP430/InstPrinter/CMakeLists.txt b/lib/Target/MSP430/InstPrinter/CMakeLists.txt index ce39d95..64ac994 100644 --- a/lib/Target/MSP430/InstPrinter/CMakeLists.txt +++ b/lib/Target/MSP430/InstPrinter/CMakeLists.txt @@ -4,9 +4,4 @@ add_llvm_library(LLVMMSP430AsmPrinter MSP430InstPrinter.cpp ) -add_llvm_library_dependencies(LLVMMSP430AsmPrinter - LLVMMC - LLVMSupport - ) - add_dependencies(LLVMMSP430AsmPrinter MSP430CommonTableGen) diff --git a/lib/Target/MSP430/InstPrinter/LLVMBuild.txt b/lib/Target/MSP430/InstPrinter/LLVMBuild.txt index aeb863a..37b8c25 100644 --- a/lib/Target/MSP430/InstPrinter/LLVMBuild.txt +++ b/lib/Target/MSP430/InstPrinter/LLVMBuild.txt @@ -21,4 +21,3 @@ name = MSP430AsmPrinter parent = MSP430 required_libraries = MC Support add_to_library_groups = MSP430 - diff --git a/lib/Target/MSP430/LLVMBuild.txt b/lib/Target/MSP430/LLVMBuild.txt index 024312b..51d9702 100644 --- a/lib/Target/MSP430/LLVMBuild.txt +++ b/lib/Target/MSP430/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = InstPrinter MCTargetDesc TargetInfo + [component_0] type = TargetGroup name = MSP430 @@ -27,4 +30,3 @@ name = MSP430CodeGen parent = MSP430 required_libraries = AsmPrinter CodeGen Core MC MSP430AsmPrinter MSP430Desc MSP430Info SelectionDAG Support Target add_to_library_groups = MSP430 - diff --git a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt index c2dd448..adc95c5 100644 --- 
a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt @@ -3,12 +3,4 @@ add_llvm_library(LLVMMSP430Desc MSP430MCAsmInfo.cpp ) -add_llvm_library_dependencies(LLVMMSP430Desc - LLVMMC - LLVMMSP430AsmPrinter - LLVMMSP430Info - LLVMSupport - LLVMTarget - ) - add_dependencies(LLVMMSP430Desc MSP430CommonTableGen) diff --git a/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt b/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt index 1890e9d..3319d93 100644 --- a/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt @@ -21,4 +21,3 @@ name = MSP430Desc parent = MSP430 required_libraries = MC MSP430AsmPrinter MSP430Info Support Target add_to_library_groups = MSP430 - diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp index c99f4ab..e406ff2 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -29,7 +29,7 @@ using namespace llvm; bool MSP430FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - return (DisableFramePointerElim(MF) || + return (MF.getTarget().Options.DisableFramePointerElim(MF) || MF.getFrameInfo()->hasVarSizedObjects() || MFI->isFrameAddressTaken()); } @@ -140,7 +140,7 @@ void MSP430FrameLowering::emitEpilogue(MachineFunction &MF, while (MBBI != MBB.begin()) { MachineBasicBlock::iterator PI = prior(MBBI); unsigned Opc = PI->getOpcode(); - if (Opc != MSP430::POP16r && !PI->getDesc().isTerminator()) + if (Opc != MSP430::POP16r && !PI->isTerminator()) break; --MBBI; } diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index 5c94137..884d69b 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -122,8 +122,12 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) : setOperationAction(ISD::CTTZ, MVT::i8, Expand); setOperationAction(ISD::CTTZ, MVT::i16, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand); setOperationAction(ISD::CTLZ, MVT::i8, Expand); setOperationAction(ISD::CTLZ, MVT::i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand); setOperationAction(ISD::CTPOP, MVT::i8, Expand); setOperationAction(ISD::CTPOP, MVT::i16, Expand); diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index 81f766e..9d3c7e9 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -158,13 +158,12 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { } bool MSP430InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { - const MCInstrDesc &MCID = MI->getDesc(); - if (!MCID.isTerminator()) return false; + if (!MI->isTerminator()) return false; // Conditional branch is a special case. - if (MCID.isBranch() && !MCID.isBarrier()) + if (MI->isBranch() && !MI->isBarrier()) return true; - if (!MCID.isPredicable()) + if (!MI->isPredicable()) return true; return !isPredicated(MI); } @@ -189,7 +188,7 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // A terminator that isn't a branch can't easily be handled // by this analysis. - if (!I->getDesc().isBranch()) + if (!I->isBranch()) return true; // Cannot handle indirect branches. 
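The ISD::CTLZ_ZERO_UNDEF and ISD::CTTZ_ZERO_UNDEF actions added here (and in the MBlazeISelLowering hunk earlier) cover the new node variants whose result is undefined when the operand is zero, matching the semantics of __builtin_clz and __builtin_ctz; MSP430 marks both flavours Expand, exactly as it already did for the plain CTLZ/CTTZ nodes. A small sketch of the semantic difference on a 16-bit value, using hypothetical helper names:

#include <cassert>
#include <cstdint>

// ISD::CTLZ semantics: defined for every input; a zero operand yields the
// full bit width (16 here).
static unsigned ctlz16(uint16_t V) {
  unsigned N = 0;
  for (uint16_t Mask = 0x8000; Mask != 0 && (V & Mask) == 0; Mask >>= 1)
    ++N;
  return N;
}

// ISD::CTLZ_ZERO_UNDEF semantics: the caller guarantees V != 0, which lets a
// target pick a cheaper lowering that never special-cases zero.
static unsigned ctlz16ZeroUndef(uint16_t V) {
  assert(V != 0 && "result is undefined for a zero operand");
  return ctlz16(V);
}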
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index fe185fb..a0fc3da 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -28,9 +28,10 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), Subtarget(TT, CPU, FS), // FIXME: Check TargetData string. DataLayout("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"), diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h index 4fb060f..28d482a 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.h +++ b/lib/Target/MSP430/MSP430TargetMachine.h @@ -39,7 +39,7 @@ class MSP430TargetMachine : public LLVMTargetMachine { public: MSP430TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); diff --git a/lib/Target/MSP430/TargetInfo/CMakeLists.txt b/lib/Target/MSP430/TargetInfo/CMakeLists.txt index 1526946..f6b40ea 100644 --- a/lib/Target/MSP430/TargetInfo/CMakeLists.txt +++ b/lib/Target/MSP430/TargetInfo/CMakeLists.txt @@ -4,10 +4,4 @@ add_llvm_library(LLVMMSP430Info MSP430TargetInfo.cpp ) -add_llvm_library_dependencies(LLVMMSP430Info - LLVMMC - LLVMSupport - LLVMTarget - ) - add_dependencies(LLVMMSP430Info MSP430CommonTableGen) diff --git a/lib/Target/MSP430/TargetInfo/LLVMBuild.txt b/lib/Target/MSP430/TargetInfo/LLVMBuild.txt index a745ea8..deafc2d 100644 --- a/lib/Target/MSP430/TargetInfo/LLVMBuild.txt +++ b/lib/Target/MSP430/TargetInfo/LLVMBuild.txt @@ -21,4 +21,3 @@ name = MSP430Info parent = MSP430 required_libraries = MC Support Target add_to_library_groups = MSP430 - diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index ac9cfc0..a13c0e8 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -29,19 +29,6 @@ add_llvm_target(MipsCodeGen MipsSelectionDAGInfo.cpp ) -add_llvm_library_dependencies(LLVMMipsCodeGen - LLVMAsmPrinter - LLVMCodeGen - LLVMCore - LLVMMC - LLVMMipsAsmPrinter - LLVMMipsDesc - LLVMMipsInfo - LLVMSelectionDAG - LLVMSupport - LLVMTarget - ) - add_subdirectory(InstPrinter) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/Mips/InstPrinter/CMakeLists.txt b/lib/Target/Mips/InstPrinter/CMakeLists.txt index c45b35d..3e9fbf1 100644 --- a/lib/Target/Mips/InstPrinter/CMakeLists.txt +++ b/lib/Target/Mips/InstPrinter/CMakeLists.txt @@ -4,9 +4,4 @@ add_llvm_library(LLVMMipsAsmPrinter MipsInstPrinter.cpp ) -add_llvm_library_dependencies(LLVMMipsAsmPrinter - LLVMMC - LLVMSupport - ) - add_dependencies(LLVMMipsAsmPrinter MipsCommonTableGen) diff --git a/lib/Target/Mips/InstPrinter/LLVMBuild.txt b/lib/Target/Mips/InstPrinter/LLVMBuild.txt index d953a61..317057b 100644 --- a/lib/Target/Mips/InstPrinter/LLVMBuild.txt +++ b/lib/Target/Mips/InstPrinter/LLVMBuild.txt @@ -21,4 +21,3 @@ name = MipsAsmPrinter parent = Mips required_libraries = MC Support add_to_library_groups = Mips - diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp index f544d39..3e9c46a 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp +++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp @@ -96,10 
+96,14 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) { case MCSymbolRefExpr::VK_None: break; case MCSymbolRefExpr::VK_Mips_GPREL: OS << "%gp_rel("; break; case MCSymbolRefExpr::VK_Mips_GOT_CALL: OS << "%call16("; break; + case MCSymbolRefExpr::VK_Mips_GOT16: OS << "%got("; break; case MCSymbolRefExpr::VK_Mips_GOT: OS << "%got("; break; case MCSymbolRefExpr::VK_Mips_ABS_HI: OS << "%hi("; break; case MCSymbolRefExpr::VK_Mips_ABS_LO: OS << "%lo("; break; case MCSymbolRefExpr::VK_Mips_TLSGD: OS << "%tlsgd("; break; + case MCSymbolRefExpr::VK_Mips_TLSLDM: OS << "%tlsldm("; break; + case MCSymbolRefExpr::VK_Mips_DTPREL_HI:OS << "%dtprel_hi("; break; + case MCSymbolRefExpr::VK_Mips_DTPREL_LO:OS << "%dtprel_lo("; break; case MCSymbolRefExpr::VK_Mips_GOTTPREL: OS << "%gottprel("; break; case MCSymbolRefExpr::VK_Mips_TPREL_HI: OS << "%tprel_hi("; break; case MCSymbolRefExpr::VK_Mips_TPREL_LO: OS << "%tprel_lo("; break; diff --git a/lib/Target/Mips/LLVMBuild.txt b/lib/Target/Mips/LLVMBuild.txt index e733b52..bcd32bc 100644 --- a/lib/Target/Mips/LLVMBuild.txt +++ b/lib/Target/Mips/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = InstPrinter MCTargetDesc TargetInfo + [component_0] type = TargetGroup name = Mips @@ -28,4 +31,3 @@ name = MipsCodeGen parent = Mips required_libraries = AsmPrinter CodeGen Core MC MipsAsmPrinter MipsDesc MipsInfo SelectionDAG Support Target add_to_library_groups = Mips - diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt index 2ceb5c9..0eb0a55 100644 --- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt @@ -5,11 +5,4 @@ add_llvm_library(LLVMMipsDesc MipsMCTargetDesc.cpp ) -add_llvm_library_dependencies(LLVMMipsDesc - LLVMMC - LLVMMipsAsmPrinter - LLVMMipsInfo - LLVMSupport - ) - add_dependencies(LLVMMipsDesc MipsCommonTableGen) diff --git a/lib/Target/Mips/MCTargetDesc/LLVMBuild.txt b/lib/Target/Mips/MCTargetDesc/LLVMBuild.txt index d6f5dd2..29f5da6 100644 --- a/lib/Target/Mips/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/Mips/MCTargetDesc/LLVMBuild.txt @@ -21,4 +21,3 @@ name = MipsDesc parent = Mips required_libraries = MC MipsAsmPrinter MipsInfo Support add_to_library_groups = Mips - diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 7bc5fe4..60ff4fe 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -29,13 +29,19 @@ #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" + using namespace llvm; +// Prepare value for the target space for it static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { // Add/subtract and shift switch (Kind) { default: + return 0; + case FK_GPRel_4: + case FK_Data_4: + case Mips::fixup_Mips_LO16: break; case Mips::fixup_Mips_PC16: // So far we are only using this type for branches. @@ -52,25 +58,10 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { // address range. 
Value >>= 2; break; - } - - // Mask off value for placement as an operand - switch (Kind) { - default: - break; - case FK_GPRel_4: - case FK_Data_4: - Value &= 0xffffffff; - break; - case Mips::fixup_Mips_26: - Value &= 0x03ffffff; - break; - case Mips::fixup_Mips_LO16: - case Mips::fixup_Mips_PC16: - Value &= 0x0000ffff; - break; case Mips::fixup_Mips_HI16: - Value >>= 16; + case Mips::fixup_Mips_GOT_Local: + // Get the higher 16-bits. Also add 1 if bit 15 is 1. + Value = (Value >> 16) + ((Value & 0x8000) != 0); break; } @@ -96,42 +87,40 @@ public: /// fixup kind as appropriate. void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const { - unsigned Kind = (unsigned)Fixup.getKind(); - Value = adjustFixupValue(Kind, Value); + MCFixupKind Kind = Fixup.getKind(); + Value = adjustFixupValue((unsigned)Kind, Value); if (!Value) - return; // Doesn't change encoding. + return; // Doesn't change encoding. unsigned Offset = Fixup.getOffset(); - switch (Kind) { - default: - llvm_unreachable("Unknown fixup kind!"); - case Mips::fixup_Mips_GOT16: // This will be fixed up at link time - break; - case FK_GPRel_4: - case FK_Data_4: - case Mips::fixup_Mips_26: - case Mips::fixup_Mips_LO16: - case Mips::fixup_Mips_PC16: - case Mips::fixup_Mips_HI16: - // For each byte of the fragment that the fixup touches, mask i - // the fixup value. The Value has been "split up" into the appr - // bitfields above. - for (unsigned i = 0; i != 4; ++i) // FIXME - Need to support 2 and 8 bytes - Data[Offset + i] += uint8_t((Value >> (i * 8)) & 0xff); - break; + // FIXME: The below code will not work across endian models + // How many bytes/bits are we fixing up? + unsigned NumBytes = ((getFixupKindInfo(Kind).TargetSize-1)/8)+1; + uint64_t Mask = ((uint64_t)1 << getFixupKindInfo(Kind).TargetSize) - 1; + + // Grab current value, if any, from bits. + uint64_t CurVal = 0; + for (unsigned i = 0; i != NumBytes; ++i) + CurVal |= ((uint8_t)Data[Offset + i]) << (i * 8); + + CurVal = (CurVal & ~Mask) | ((CurVal + Value) & Mask); + + // Write out the bytes back to the code/data bits. + // First the unaffected bits and then the fixup. + for (unsigned i = 0; i != NumBytes; ++i) { + Data[Offset + i] = uint8_t((CurVal >> (i * 8)) & 0xff); } - } +} unsigned getNumFixupKinds() const { return Mips::NumTargetFixupKinds; } const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const { const static MCFixupKindInfo Infos[Mips::NumTargetFixupKinds] = { - // This table *must* be in the order that the fixup_* kinds a + // This table *must* be in same the order of fixup_* kinds in // MipsFixupKinds.h. // // name offset bits flags - { "fixup_Mips_NONE", 0, 0, 0 }, { "fixup_Mips_16", 0, 16, 0 }, { "fixup_Mips_32", 0, 32, 0 }, { "fixup_Mips_REL32", 0, 32, 0 }, @@ -140,7 +129,8 @@ public: { "fixup_Mips_LO16", 0, 16, 0 }, { "fixup_Mips_GPREL16", 0, 16, 0 }, { "fixup_Mips_LITERAL", 0, 16, 0 }, - { "fixup_Mips_GOT16", 0, 16, 0 }, + { "fixup_Mips_GOT_Global", 0, 16, 0 }, + { "fixup_Mips_GOT_Local", 0, 16, 0 }, { "fixup_Mips_PC16", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_Mips_CALL16", 0, 16, 0 }, { "fixup_Mips_GPREL32", 0, 32, 0 }, @@ -173,6 +163,17 @@ public: return false; } + /// fixupNeedsRelaxation - Target specific predicate for whether a given + /// fixup requires the associated instruction to be relaxed. + bool fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const { + // FIXME. 
+ assert(0 && "RelaxInstruction() unimplemented"); + return false; + } + /// RelaxInstruction - Relax the instruction in the given fragment /// to the next wider instruction. /// diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h index cebfde0..00fc5df 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h +++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h @@ -31,8 +31,9 @@ namespace MipsII { MO_NO_FLAG, - /// MO_GOT - Represents the offset into the global offset table at which + /// MO_GOT16 - Represents the offset into the global offset table at which /// the address the relocation entry symbol resides during execution. + MO_GOT16, MO_GOT, /// MO_GOT_CALL - Represents the offset into the global offset table at @@ -55,6 +56,13 @@ namespace MipsII { // Dynamic TLS). MO_TLSGD, + /// MO_TLSLDM - Represents the offset into the global offset table at which + // the module ID and TLS block offset reside during execution (Local + // Dynamic TLS). + MO_TLSLDM, + MO_DTPREL_HI, + MO_DTPREL_LO, + /// MO_GOTTPREL - Represents the offset from the thread pointer (Initial // Exec TLS). MO_GOTTPREL, @@ -180,6 +188,7 @@ inline static unsigned getMipsRegisterNumbering(unsigned RegEnum) case Mips::D14: return 28; case Mips::SP: case Mips::SP_64: case Mips::F29: case Mips::D29_64: + case Mips::HWR29: return 29; case Mips::FP: case Mips::FP_64: case Mips::F30: case Mips::D30_64: case Mips::D15: diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index 20890ed..a56c002 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -14,74 +14,82 @@ namespace llvm { namespace Mips { - enum Fixups { - // fixup_Mips_xxx - R_MIPS_NONE - fixup_Mips_NONE = FirstTargetFixupKind, + // Although most of the current fixup types reflect a unique relocation, + // one can have multiple fixup types for a given relocation and thus need + // to be uniquely named. + // + // This table *must* be in the same order as + // MCFixupKindInfo Infos[Mips::NumTargetFixupKinds] + // in MipsAsmBackend.cpp. + // + enum Fixups { + // Branch fixups resulting in R_MIPS_16. + fixup_Mips_16 = FirstTargetFixupKind, - // fixup_Mips_xxx - R_MIPS_16. - fixup_Mips_16, + // Pure 32 bit data fixup resulting in - R_MIPS_32. + fixup_Mips_32, - // fixup_Mips_xxx - R_MIPS_32. - fixup_Mips_32, + // Full 32 bit data relative data fixup resulting in - R_MIPS_REL32. + fixup_Mips_REL32, - // fixup_Mips_xxx - R_MIPS_REL32. - fixup_Mips_REL32, + // Jump 26 bit fixup resulting in - R_MIPS_26. + fixup_Mips_26, - // fixup_Mips_xxx - R_MIPS_26. - fixup_Mips_26, + // Pure upper 16 bit fixup resulting in - R_MIPS_HI16. + fixup_Mips_HI16, - // fixup_Mips_xxx - R_MIPS_HI16. - fixup_Mips_HI16, + // Pure lower 16 bit fixup resulting in - R_MIPS_LO16. + fixup_Mips_LO16, - // fixup_Mips_xxx - R_MIPS_LO16. - fixup_Mips_LO16, + // 16 bit fixup for GP offset resulting in - R_MIPS_GPREL16. + fixup_Mips_GPREL16, - // fixup_Mips_xxx - R_MIPS_GPREL16. - fixup_Mips_GPREL16, + // 16 bit literal fixup resulting in - R_MIPS_LITERAL. + fixup_Mips_LITERAL, - // fixup_Mips_xxx - R_MIPS_LITERAL. - fixup_Mips_LITERAL, + // Global symbol fixup resulting in - R_MIPS_GOT16. + fixup_Mips_GOT_Global, - // fixup_Mips_xxx - R_MIPS_GOT16. - fixup_Mips_GOT16, + // Local symbol fixup resulting in - R_MIPS_GOT16. + fixup_Mips_GOT_Local, - // fixup_Mips_xxx - R_MIPS_PC16. - fixup_Mips_PC16, + // PC relative branch fixup resulting in - R_MIPS_PC16.
+ fixup_Mips_PC16, - // fixup_Mips_xxx - R_MIPS_CALL16. - fixup_Mips_CALL16, + // resulting in - R_MIPS_CALL16. + fixup_Mips_CALL16, - // fixup_Mips_xxx - R_MIPS_GPREL32. - fixup_Mips_GPREL32, + // resulting in - R_MIPS_GPREL32. + fixup_Mips_GPREL32, - // fixup_Mips_xxx - R_MIPS_SHIFT5. - fixup_Mips_SHIFT5, + // resulting in - R_MIPS_SHIFT5. + fixup_Mips_SHIFT5, - // fixup_Mips_xxx - R_MIPS_SHIFT6. - fixup_Mips_SHIFT6, + // resulting in - R_MIPS_SHIFT6. + fixup_Mips_SHIFT6, - // fixup_Mips_xxx - R_MIPS_64. - fixup_Mips_64, + // Pure 64 bit data fixup resulting in - R_MIPS_64. + fixup_Mips_64, - // fixup_Mips_xxx - R_MIPS_TLS_GD. - fixup_Mips_TLSGD, + // resulting in - R_MIPS_TLS_GD. + fixup_Mips_TLSGD, - // fixup_Mips_xxx - R_MIPS_TLS_GOTTPREL. - fixup_Mips_GOTTPREL, + // resulting in - R_MIPS_TLS_GOTTPREL. + fixup_Mips_GOTTPREL, - // fixup_Mips_xxx - R_MIPS_TLS_TPREL_HI16. - fixup_Mips_TPREL_HI, + // resulting in - R_MIPS_TLS_TPREL_HI16. + fixup_Mips_TPREL_HI, - // fixup_Mips_xxx - R_MIPS_TLS_TPREL_LO16. - fixup_Mips_TPREL_LO, + // resulting in - R_MIPS_TLS_TPREL_LO16. + fixup_Mips_TPREL_LO, - // fixup_Mips_xxx - yyy. // This should become R_MIPS_PC16 - fixup_Mips_Branch_PCRel, + // PC relative branch fixup resulting in - R_MIPS_PC16 + fixup_Mips_Branch_PCRel, - // Marker - LastTargetFixupKind, - NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind - }; + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind + }; } // namespace Mips } // namespace llvm diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 0c3cbb3..463dcfe 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -194,8 +194,11 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, case MCSymbolRefExpr::VK_Mips_GOT_CALL: FixupKind = Mips::fixup_Mips_CALL16; break; + case MCSymbolRefExpr::VK_Mips_GOT16: + FixupKind = Mips::fixup_Mips_GOT_Global; + break; case MCSymbolRefExpr::VK_Mips_GOT: - FixupKind = Mips::fixup_Mips_GOT16; + FixupKind = Mips::fixup_Mips_GOT_Local; break; case MCSymbolRefExpr::VK_Mips_ABS_HI: FixupKind = Mips::fixup_Mips_HI16; @@ -245,8 +248,8 @@ unsigned MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups) const { assert(MI.getOperand(OpNo).isImm()); - unsigned szEncoding = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups); - return szEncoding - 1; + unsigned SizeEncoding = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups); + return SizeEncoding - 1; } // FIXME: should be called getMSBEncoding @@ -256,10 +259,10 @@ MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups) const { assert(MI.getOperand(OpNo-1).isImm()); assert(MI.getOperand(OpNo).isImm()); - unsigned pos = getMachineOpValue(MI, MI.getOperand(OpNo-1), Fixups); - unsigned sz = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups); + unsigned Position = getMachineOpValue(MI, MI.getOperand(OpNo-1), Fixups); + unsigned Size = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups); - return pos + sz - 1; + return Position + Size - 1; } #include "MipsGenMCCodeEmitter.inc" diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index 39c2c16..e9e0f60 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -79,9 +79,9 @@ def FeatureMips64r2 : SubtargetFeature<"mips64r2", "MipsArchVersion", class Proc<string Name, list<SubtargetFeature> 
Features> : Processor<Name, MipsGenericItineraries, Features>; -def : Proc<"mips32r1", [FeatureMips32]>; -def : Proc<"4ke", [FeatureMips32r2]>; -def : Proc<"mips64r1", [FeatureMips64]>; +def : Proc<"mips32", [FeatureMips32]>; +def : Proc<"mips32r2", [FeatureMips32r2]>; +def : Proc<"mips64", [FeatureMips64]>; def : Proc<"mips64r2", [FeatureMips64r2]>; def MipsAsmWriter : AsmWriter { diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index b0fb4fa..2996986 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -25,7 +25,7 @@ def uimm16_64 : Operand<i64> { // Transformation Function - get Imm - 32. def Subtract32 : SDNodeXForm<imm, [{ - return getI32Imm((unsigned)N->getZExtValue() - 32); + return getImm(N, (unsigned)N->getZExtValue() - 32); }]>; // shamt field must fit in 5 bits. @@ -36,6 +36,19 @@ def imm32_63 : ImmLeaf<i32, [{return (int32_t)Imm >= 32 && (int32_t)Imm < 64;}], Subtract32>; +// Is a 32-bit int. +def immSExt32 : ImmLeaf<i64, [{return isInt<32>(Imm);}]>; + +// Transformation Function - get the higher 16 bits. +def HIGHER : SDNodeXForm<imm, [{ + return getImm(N, (N->getZExtValue() >> 32) & 0xFFFF); +}]>; + +// Transformation Function - get the highest 16 bits. +def HIGHEST : SDNodeXForm<imm, [{ + return getImm(N, (N->getZExtValue() >> 48) & 0xFFFF); +}]>; + //===----------------------------------------------------------------------===// // Instructions specific format //===----------------------------------------------------------------------===// @@ -206,6 +219,17 @@ let Uses = [SP_64] in def DynAlloc64 : EffectiveAddress<"daddiu\t$rt, $addr", CPU64Regs, mem_ea_64>, Requires<[IsN64]>; +def RDHWR64 : ReadHardware<CPU64Regs, HWRegs64>; + +def DEXT : ExtBase<3, "dext", CPU64Regs>; +def DINS : InsBase<7, "dins", CPU64Regs>; + +def DSLL64_32 : FR<0x3c, 0x00, (outs CPU64Regs:$rd), (ins CPURegs:$rt), + "dsll32\t$rd, $rt, 0", [], IIAlu>; + +def SLL64_32 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPURegs:$rt), + "sll\t$rd, $rt, 0", [], IIAlu>; + //===----------------------------------------------------------------------===// // Arbitrary patterns that map to one or more instructions //===----------------------------------------------------------------------===// @@ -216,9 +240,15 @@ def : Pat<(i64 immSExt16:$in), def : Pat<(i64 immZExt16:$in), (ORi64 ZERO_64, imm:$in)>; +// 32-bit immediates +def : Pat<(i64 immSExt32:$imm), + (ORi64 (LUi64 (HI16 imm:$imm)), (LO16 imm:$imm))>; + // Arbitrary immediates def : Pat<(i64 imm:$imm), - (ORi64 (LUi64 (HI16 imm:$imm)), (LO16 imm:$imm))>; + (ORi64 (DSLL (ORi64 (DSLL (ORi64 (LUi64 (HIGHEST imm:$imm)), + (HIGHER imm:$imm)), 16), (HI16 imm:$imm)), 16), + (LO16 imm:$imm))>; // extended loads let Predicates = [NotN64] in { @@ -236,11 +266,13 @@ def : Pat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>; def : Pat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>; def : Pat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>; def : Pat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>; +def : Pat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>; def : Pat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>; def : Pat<(MipsLo tblockaddress:$in), (DADDiu ZERO_64, tblockaddress:$in)>; def : Pat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>; def : Pat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>; +def : Pat<(MipsLo tglobaltlsaddr:$in), (DADDiu ZERO_64, tglobaltlsaddr:$in)>; def : Pat<(add CPU64Regs:$hi, (MipsLo tglobaladdr:$lo)), (DADDiu 
CPU64Regs:$hi, tglobaladdr:$lo)>; @@ -250,6 +282,15 @@ def : Pat<(add CPU64Regs:$hi, (MipsLo tjumptable:$lo)), (DADDiu CPU64Regs:$hi, tjumptable:$lo)>; def : Pat<(add CPU64Regs:$hi, (MipsLo tconstpool:$lo)), (DADDiu CPU64Regs:$hi, tconstpool:$lo)>; +def : Pat<(add CPU64Regs:$hi, (MipsLo tglobaltlsaddr:$lo)), + (DADDiu CPU64Regs:$hi, tglobaltlsaddr:$lo)>; + +def : WrapperPat<tglobaladdr, DADDiu, GP_64>; +def : WrapperPat<tconstpool, DADDiu, GP_64>; +def : WrapperPat<texternalsym, DADDiu, GP_64>; +def : WrapperPat<tblockaddress, DADDiu, GP_64>; +def : WrapperPat<tjumptable, DADDiu, GP_64>; +def : WrapperPat<tglobaltlsaddr, DADDiu, GP_64>; defm : BrcondPats<CPU64Regs, BEQ64, BNE64, SLT64, SLTu64, SLTi64, SLTiu64, ZERO_64>; @@ -268,3 +309,6 @@ def : Pat<(MipsDynAlloc addr:$f), (DynAlloc64 addr:$f)>, Requires<[IsN64]>; def : Pat<(i32 (trunc CPU64Regs:$src)), (SLL (EXTRACT_SUBREG CPU64Regs:$src, sub_32), 0)>, Requires<[IsN64]>; +// 32-to-64-bit extension +def : Pat<(i64 (anyext CPURegs:$src)), (SLL64_32 CPURegs:$src)>; +def : Pat<(i64 (zext CPURegs:$src)), (DSRL32 (DSLL64_32 CPURegs:$src), 0)>; diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index d27e3ab..a5505d3 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -96,19 +96,17 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (!OutStreamer.hasRawTextSupport()) { // Lower CPLOAD and CPRESTORE - if (Opc == Mips::CPLOAD) { + if (Opc == Mips::CPLOAD) MCInstLowering.LowerCPLOAD(MI, MCInsts); - for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); I - != MCInsts.end(); ++I) + else if (Opc == Mips::CPRESTORE) + MCInstLowering.LowerCPRESTORE(MI, MCInsts); + + if (!MCInsts.empty()) { + for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); + I != MCInsts.end(); ++I) OutStreamer.EmitInstruction(*I); return; } - - if (Opc == Mips::CPRESTORE) { - MCInstLowering.LowerCPRESTORE(MI, TmpInst0); - OutStreamer.EmitInstruction(TmpInst0); - return; - } } OutStreamer.EmitInstruction(TmpInst0); @@ -317,9 +315,9 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* // Otherwise, check the last instruction. // Check if the last terminator is an unconditional branch. MachineBasicBlock::const_iterator I = Pred->end(); - while (I != Pred->begin() && !(--I)->getDesc().isTerminator()) ; + while (I != Pred->begin() && !(--I)->isTerminator()) ; - return !I->getDesc().isBarrier(); + return !I->isBarrier(); } // Print out an operand for an inline asm expression. 
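For reference, a hedged C++ sketch (not part of the patch; the chunk helper is invented for illustration) of what the new "arbitrary immediates" pattern in Mips64InstrInfo.td above expands to: the HIGHEST, HIGHER, HI16 and LO16 transforms each select one 16-bit slice of the 64-bit immediate, and the lui/ori/dsll chain reassembles them.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Mirror of the TableGen transforms: pick out one 16-bit chunk of the immediate.
static uint64_t chunk(uint64_t Imm, unsigned Shift) {
  return (Imm >> Shift) & 0xffff;
}

int main() {
  const uint64_t Imm = 0x123456789abcdef0ULL;

  // lui  $r, HIGHEST(imm)      ; bits 63-48
  // ori  $r, $r, HIGHER(imm)   ; bits 47-32
  // dsll $r, $r, 16
  // ori  $r, $r, HI16(imm)     ; bits 31-16
  // dsll $r, $r, 16
  // ori  $r, $r, LO16(imm)     ; bits 15-0
  uint64_t R = chunk(Imm, 48) << 16;   // lui (its sign extension is shifted
                                       // out by the two dsll steps below)
  R = (R | chunk(Imm, 32)) << 16;
  R = (R | chunk(Imm, 16)) << 16;
  R |= chunk(Imm, 0);

  assert(R == Imm);
  std::printf("materialized 0x%llx\n", (unsigned long long)R);
  return 0;
}

Any sign bits that lui smears into the upper word are irrelevant here, because the two dsll-by-16 shifts push them out of the 64-bit register before the final ori.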
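Similarly, a minimal standalone sketch (again outside the patch; the splitHiLo helper is hypothetical) of the carry adjustment that the reworked adjustFixupValue applies for fixup_Mips_HI16 and fixup_Mips_GOT_Local above, and that the .cprestore lowering further down reuses for offsets of 0x8000 or more: because the low half is later consumed as a sign-extended 16-bit immediate, the high half has to absorb a carry whenever bit 15 of the value is set.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Split a 32-bit value into the %hi/%lo halves MIPS expects.
static void splitHiLo(uint32_t Value, uint32_t &Hi, uint32_t &Lo) {
  Lo = Value & 0xffff;
  Hi = (Value >> 16) + ((Value & 0x8000) != 0);  // add 1 if bit 15 is 1
}

int main() {
  uint32_t Hi, Lo;
  splitHiLo(0x12348abcu, Hi, Lo);
  // Reassemble the way the hardware does: lui $at, Hi ; addiu $at, $at, Lo,
  // where Lo is sign-extended to 32 bits before the add.
  uint32_t Rebuilt =
      (Hi << 16) + static_cast<uint32_t>(static_cast<int16_t>(Lo));
  assert(Rebuilt == 0x12348abcu);
  std::printf("hi=0x%x lo=0x%x\n", (unsigned)Hi, (unsigned)Lo);
  return 0;
}

The same rule is why LowerCPRESTORE below computes Hi as (Offset >> 16) + ((Offset & 0x8000) != 0) before emitting the lui/addu/sw sequence.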
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp index a8f29ae..6b26e24 100644 --- a/lib/Target/Mips/MipsCodeEmitter.cpp +++ b/lib/Target/Mips/MipsCodeEmitter.cpp @@ -144,7 +144,7 @@ bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB){ MCE.StartMachineBasicBlock(MBB); - for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ++I) emitInstruction(*I); } @@ -161,7 +161,7 @@ unsigned MipsCodeEmitter::getRelocation(const MachineInstr &MI, if (Form == MipsII::FrmJ) return Mips::reloc_mips_26; if ((Form == MipsII::FrmI || Form == MipsII::FrmFI) - && MI.getDesc().isBranch()) + && MI.isBranch()) return Mips::reloc_mips_branch; if (Form == MipsII::FrmI && MI.getOpcode() == Mips::LUi) return Mips::reloc_mips_hi; diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index be3b7a0..1d9e9b0 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -96,7 +96,7 @@ runOnMachineBasicBlock(MachineBasicBlock &MBB) { LastFiller = MBB.end(); for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) - if (I->getDesc().hasDelaySlot()) { + if (I->hasDelaySlot()) { ++FilledSlots; Changed = true; @@ -146,7 +146,7 @@ bool Filler::findDelayInstr(MachineBasicBlock &MBB, || I->isInlineAsm() || I->isLabel() || FI == LastFiller - || I->getDesc().isPseudo() + || I->isPseudo() // // Should not allow: // ERET, DERET or WAIT, PAUSE. Need to add these to instruction @@ -174,16 +174,15 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate, if (candidate->isImplicitDef() || candidate->isKill()) return true; - MCInstrDesc MCID = candidate->getDesc(); // Loads or stores cannot be moved past a store to the delay slot // and stores cannot be moved past a load. - if (MCID.mayLoad()) { + if (candidate->mayLoad()) { if (sawStore) return true; sawLoad = true; } - if (MCID.mayStore()) { + if (candidate->mayStore()) { if (sawStore) return true; sawStore = true; @@ -191,7 +190,7 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate, return true; } - assert((!MCID.isCall() && !MCID.isReturn()) && + assert((!candidate->isCall() && !candidate->isReturn()) && "Cannot put calls or returns in delay slot."); for (unsigned i = 0, e = candidate->getNumOperands(); i!= e; ++i) { @@ -221,11 +220,11 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI, SmallSet<unsigned, 32>& RegUses) { // If MI is a call or return, just examine the explicit non-variadic operands. MCInstrDesc MCID = MI->getDesc(); - unsigned e = MCID.isCall() || MCID.isReturn() ? MCID.getNumOperands() : - MI->getNumOperands(); + unsigned e = MI->isCall() || MI->isReturn() ? MCID.getNumOperands() : + MI->getNumOperands(); // Add RA to RegDefs to prevent users of RA from going into delay slot. - if (MCID.isCall()) + if (MI->isCall()) RegDefs.insert(Mips::RA); for (unsigned i = 0; i != e; ++i) { diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index 36aef99..2466545 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -85,8 +85,8 @@ using namespace llvm; // if frame pointer elimination is disabled. 
bool MipsFrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - return DisableFramePointerElim(MF) || MFI->hasVarSizedObjects() - || MFI->isFrameAddressTaken(); + return MF.getTarget().Options.DisableFramePointerElim(MF) || + MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); } bool MipsFrameLowering::targetHandlesStackFrameRounding() const { diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 9c831ed..b17239d 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -86,10 +86,9 @@ private: // Complex Pattern. bool SelectAddr(SDValue N, SDValue &Base, SDValue &Offset); - // getI32Imm - Return a target constant with the specified - // value, of type i32. - inline SDValue getI32Imm(unsigned Imm) { - return CurDAG->getTargetConstant(Imm, MVT::i32); + // getImm - Return a target constant with the specified value. + inline SDValue getImm(const SDNode *Node, unsigned Imm) { + return CurDAG->getTargetConstant(Imm, Node->getValueType(0)); } virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, @@ -122,21 +121,16 @@ SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) { } // on PIC code Load GA - if (TM.getRelocationModel() == Reloc::PIC_) { - if (Addr.getOpcode() == MipsISD::WrapperPIC) { - Base = CurDAG->getRegister(GPReg, ValTy); - Offset = Addr.getOperand(0); - return true; - } - } else { + if (Addr.getOpcode() == MipsISD::Wrapper) { + Base = CurDAG->getRegister(GPReg, ValTy); + Offset = Addr.getOperand(0); + return true; + } + + if (TM.getRelocationModel() != Reloc::PIC_) { if ((Addr.getOpcode() == ISD::TargetExternalSymbol || Addr.getOpcode() == ISD::TargetGlobalAddress)) return false; - else if (Addr.getOpcode() == ISD::TargetGlobalTLSAddress) { - Base = CurDAG->getRegister(GPReg, ValTy); - Offset = Addr; - return true; - } } // Addresses of the form FI+const or FI|const @@ -310,13 +304,24 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { } case MipsISD::ThreadPointer: { - unsigned SrcReg = Mips::HWR29; - unsigned DestReg = Mips::V1; - SDNode *Rdhwr = CurDAG->getMachineNode(Mips::RDHWR, Node->getDebugLoc(), - Node->getValueType(0), CurDAG->getRegister(SrcReg, MVT::i32)); + EVT PtrVT = TLI.getPointerTy(); + unsigned RdhwrOpc, SrcReg, DestReg; + + if (PtrVT == MVT::i32) { + RdhwrOpc = Mips::RDHWR; + SrcReg = Mips::HWR29; + DestReg = Mips::V1; + } else { + RdhwrOpc = Mips::RDHWR64; + SrcReg = Mips::HWR29_64; + DestReg = Mips::V1_64; + } + + SDNode *Rdhwr = CurDAG->getMachineNode(RdhwrOpc, Node->getDebugLoc(), + Node->getValueType(0), CurDAG->getRegister(SrcReg, PtrVT)); SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, DestReg, SDValue(Rdhwr, 0)); - SDValue ResNode = CurDAG->getCopyFromReg(Chain, dl, DestReg, MVT::i32); + SDValue ResNode = CurDAG->getCopyFromReg(Chain, dl, DestReg, PtrVT); ReplaceUses(SDValue(Node, 0), ResNode); return ResNode.getNode(); } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index b5a15cf..c9b657c 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -40,11 +40,11 @@ using namespace llvm; // mask (Pos), and return true. // For example, if I is 0x003ff800, (Pos, Size) = (11, 11). 
static bool IsShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) { - if (!isUInt<32>(I) || !isShiftedMask_32(I)) + if (!isShiftedMask_64(I)) return false; - Size = CountPopulation_32(I); - Pos = CountTrailingZeros_32(I); + Size = CountPopulation_64(I); + Pos = CountTrailingZeros_64(I); return true; } @@ -54,9 +54,6 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::Hi: return "MipsISD::Hi"; case MipsISD::Lo: return "MipsISD::Lo"; case MipsISD::GPRel: return "MipsISD::GPRel"; - case MipsISD::TlsGd: return "MipsISD::TlsGd"; - case MipsISD::TprelHi: return "MipsISD::TprelHi"; - case MipsISD::TprelLo: return "MipsISD::TprelLo"; case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer"; case MipsISD::Ret: return "MipsISD::Ret"; case MipsISD::FPBrcond: return "MipsISD::FPBrcond"; @@ -72,7 +69,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::DivRemU: return "MipsISD::DivRemU"; case MipsISD::BuildPairF64: return "MipsISD::BuildPairF64"; case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64"; - case MipsISD::WrapperPIC: return "MipsISD::WrapperPIC"; + case MipsISD::Wrapper: return "MipsISD::Wrapper"; case MipsISD::DynAlloc: return "MipsISD::DynAlloc"; case MipsISD::Sync: return "MipsISD::Sync"; case MipsISD::Ext: return "MipsISD::Ext"; @@ -129,7 +126,9 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::BlockAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::JumpTable, MVT::i32, Custom); + setOperationAction(ISD::JumpTable, MVT::i64, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); @@ -157,6 +156,10 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); setOperationAction(ISD::CTPOP, MVT::i32, Expand); setOperationAction(ISD::CTTZ, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); @@ -555,20 +558,20 @@ static SDValue PerformANDCombine(SDNode *N, SelectionDAG& DAG, return SDValue(); SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1); - + unsigned ShiftRightOpc = ShiftRight.getOpcode(); + // Op's first operand must be a shift right. - if (ShiftRight.getOpcode() != ISD::SRA && ShiftRight.getOpcode() != ISD::SRL) + if (ShiftRightOpc != ISD::SRA && ShiftRightOpc != ISD::SRL) return SDValue(); // The second operand of the shift must be an immediate. - uint64_t Pos; ConstantSDNode *CN; if (!(CN = dyn_cast<ConstantSDNode>(ShiftRight.getOperand(1)))) return SDValue(); - Pos = CN->getZExtValue(); - + uint64_t Pos = CN->getZExtValue(); uint64_t SMPos, SMSize; + // Op's second operand must be a shifted mask. if (!(CN = dyn_cast<ConstantSDNode>(Mask)) || !IsShiftedMask(CN->getZExtValue(), SMPos, SMSize)) @@ -576,10 +579,11 @@ static SDValue PerformANDCombine(SDNode *N, SelectionDAG& DAG, // Return if the shifted mask does not start at bit 0 or the sum of its size // and Pos exceeds the word's size. 
- if (SMPos != 0 || Pos + SMSize > 32) + EVT ValTy = N->getValueType(0); + if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits()) return SDValue(); - return DAG.getNode(MipsISD::Ext, N->getDebugLoc(), MVT::i32, + return DAG.getNode(MipsISD::Ext, N->getDebugLoc(), ValTy, ShiftRight.getOperand(0), DAG.getConstant(Pos, MVT::i32), DAG.getConstant(SMSize, MVT::i32)); @@ -630,10 +634,11 @@ static SDValue PerformORCombine(SDNode *N, SelectionDAG& DAG, // Return if the shift amount and the first bit position of mask are not the // same. - if (Shamt != SMPos0) + EVT ValTy = N->getValueType(0); + if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits())) return SDValue(); - return DAG.getNode(MipsISD::Ins, N->getDebugLoc(), MVT::i32, + return DAG.getNode(MipsISD::Ins, N->getDebugLoc(), ValTy, Shl.getOperand(0), DAG.getConstant(SMPos0, MVT::i32), DAG.getConstant(SMSize0, MVT::i32), @@ -1485,9 +1490,9 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, (GV->hasLocalLinkage() && !isa<Function>(GV))); unsigned GotFlag = IsN64 ? (HasGotOfst ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT_DISP) : - MipsII::MO_GOT; + (HasGotOfst ? MipsII::MO_GOT : MipsII::MO_GOT16); SDValue GA = DAG.getTargetGlobalAddress(GV, dl, ValTy, 0, GotFlag); - GA = DAG.getNode(MipsISD::WrapperPIC, dl, ValTy, GA); + GA = DAG.getNode(MipsISD::Wrapper, dl, ValTy, GA); SDValue ResNode = DAG.getLoad(ValTy, dl, DAG.getEntryNode(), GA, MachinePointerInfo(), false, false, false, 0); @@ -1523,7 +1528,7 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op, unsigned GOTFlag = IsN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; unsigned OFSTFlag = IsN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; SDValue BAGOTOffset = DAG.getBlockAddress(BA, ValTy, true, GOTFlag); - BAGOTOffset = DAG.getNode(MipsISD::WrapperPIC, dl, ValTy, BAGOTOffset); + BAGOTOffset = DAG.getNode(MipsISD::Wrapper, dl, ValTy, BAGOTOffset); SDValue BALOOffset = DAG.getBlockAddress(BA, ValTy, true, OFSTFlag); SDValue Load = DAG.getLoad(ValTy, dl, DAG.getEntryNode(), BAGOTOffset, @@ -1535,9 +1540,9 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op, SDValue MipsTargetLowering:: LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { - // If the relocation model is PIC, use the General Dynamic TLS Model, - // otherwise use the Initial Exec or Local Exec TLS Model. - // TODO: implement Local Dynamic TLS model + // If the relocation model is PIC, use the General Dynamic TLS Model or + // Local Dynamic TLS model, otherwise use the Initial Exec or + // Local Exec TLS Model. GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); DebugLoc dl = GA->getDebugLoc(); @@ -1546,45 +1551,59 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { // General Dynamic TLS Model - SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, - 0, MipsII::MO_TLSGD); - SDValue Tlsgd = DAG.getNode(MipsISD::TlsGd, dl, MVT::i32, TGA); - SDValue GP = DAG.getRegister(Mips::GP, MVT::i32); - SDValue Argument = DAG.getNode(ISD::ADD, dl, MVT::i32, GP, Tlsgd); + bool LocalDynamic = GV->hasInternalLinkage(); + unsigned Flag = LocalDynamic ? 
MipsII::MO_TLSLDM :MipsII::MO_TLSGD; + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, Flag); + SDValue Argument = DAG.getNode(MipsISD::Wrapper, dl, PtrVT, TGA); + unsigned PtrSize = PtrVT.getSizeInBits(); + IntegerType *PtrTy = Type::getIntNTy(*DAG.getContext(), PtrSize); + + SDValue TlsGetAddr = DAG.getExternalSymbol("__tls_get_addr", PtrVT); ArgListTy Args; ArgListEntry Entry; Entry.Node = Argument; - Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); + Entry.Ty = PtrTy; Args.push_back(Entry); + std::pair<SDValue, SDValue> CallResult = - LowerCallTo(DAG.getEntryNode(), - (Type *) Type::getInt32Ty(*DAG.getContext()), - false, false, false, false, 0, CallingConv::C, false, true, - DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, - dl); - - return CallResult.first; + LowerCallTo(DAG.getEntryNode(), PtrTy, + false, false, false, false, 0, CallingConv::C, false, true, + TlsGetAddr, Args, DAG, dl); + + SDValue Ret = CallResult.first; + + if (!LocalDynamic) + return Ret; + + SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + MipsII::MO_DTPREL_HI); + SDValue Hi = DAG.getNode(MipsISD::Hi, dl, PtrVT, TGAHi); + SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + MipsII::MO_DTPREL_LO); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, PtrVT, TGALo); + SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Ret); + return DAG.getNode(ISD::ADD, dl, PtrVT, Add, Lo); } SDValue Offset; if (GV->isDeclaration()) { // Initial Exec TLS Model - SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, MipsII::MO_GOTTPREL); - Offset = DAG.getLoad(MVT::i32, dl, + TGA = DAG.getNode(MipsISD::Wrapper, dl, PtrVT, TGA); + Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), TGA, MachinePointerInfo(), false, false, false, 0); } else { // Local Exec TLS Model - SDVTList VTs = DAG.getVTList(MVT::i32); - SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, MipsII::MO_TPREL_HI); - SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, MipsII::MO_TPREL_LO); - SDValue Hi = DAG.getNode(MipsISD::TprelHi, dl, VTs, &TGAHi, 1); - SDValue Lo = DAG.getNode(MipsISD::TprelLo, dl, MVT::i32, TGALo); - Offset = DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo); + SDValue Hi = DAG.getNode(MipsISD::Hi, dl, PtrVT, TGAHi); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, PtrVT, TGALo); + Offset = DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo); } SDValue ThreadPointer = DAG.getNode(MipsISD::ThreadPointer, dl, PtrVT); @@ -1594,34 +1613,29 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const SDValue MipsTargetLowering:: LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { - SDValue ResNode; - SDValue HiPart; + SDValue HiPart, JTI, JTILo; // FIXME there isn't actually debug info here DebugLoc dl = Op.getDebugLoc(); bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_; - unsigned char OpFlag = IsPIC ? 
MipsII::MO_GOT : MipsII::MO_ABS_HI; - EVT PtrVT = Op.getValueType(); - JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); - SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); - - if (!IsPIC) { - SDValue Ops[] = { JTI }; - HiPart = DAG.getNode(MipsISD::Hi, dl, DAG.getVTList(MVT::i32), Ops, 1); + if (!IsPIC && !IsN64) { + JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MipsII::MO_ABS_HI); + HiPart = DAG.getNode(MipsISD::Hi, dl, PtrVT, JTI); + JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MipsII::MO_ABS_LO); } else {// Emit Load from Global Pointer - JTI = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, JTI); - HiPart = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(), JTI, - MachinePointerInfo(), - false, false, false, 0); + unsigned GOTFlag = IsN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; + unsigned OfstFlag = IsN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; + JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, GOTFlag); + JTI = DAG.getNode(MipsISD::Wrapper, dl, PtrVT, JTI); + HiPart = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), JTI, + MachinePointerInfo(), false, false, false, 0); + JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OfstFlag); } - SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - MipsII::MO_ABS_LO); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, JTILo); - ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo); - - return ResNode; + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, PtrVT, JTILo); + return DAG.getNode(ISD::ADD, dl, PtrVT, HiPart, Lo); } SDValue MipsTargetLowering:: @@ -1657,7 +1671,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const unsigned OFSTFlag = IsN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; SDValue CP = DAG.getTargetConstantPool(C, ValTy, N->getAlignment(), N->getOffset(), GOTFlag); - CP = DAG.getNode(MipsISD::WrapperPIC, dl, ValTy, CP); + CP = DAG.getNode(MipsISD::Wrapper, dl, ValTy, CP); SDValue Load = DAG.getLoad(ValTy, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(), false, false, false, 0); @@ -1685,21 +1699,29 @@ SDValue MipsTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV), false, false, 0); } - -static SDValue LowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG) { + +// Called if the size of integer registers is large enough to hold the whole +// floating point number. +static SDValue LowerFCOPYSIGNLargeIntReg(SDValue Op, SelectionDAG &DAG) { // FIXME: Use ext/ins instructions if target architecture is Mips32r2. 
+ EVT ValTy = Op.getValueType(); + EVT IntValTy = MVT::getIntegerVT(ValTy.getSizeInBits()); + uint64_t Mask = (uint64_t)1 << (ValTy.getSizeInBits() - 1); DebugLoc dl = Op.getDebugLoc(); - SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(0)); - SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(1)); - SDValue And0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op0, - DAG.getConstant(0x7fffffff, MVT::i32)); - SDValue And1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op1, - DAG.getConstant(0x80000000, MVT::i32)); - SDValue Result = DAG.getNode(ISD::OR, dl, MVT::i32, And0, And1); - return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Result); + SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntValTy, Op.getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntValTy, Op.getOperand(1)); + SDValue And0 = DAG.getNode(ISD::AND, dl, IntValTy, Op0, + DAG.getConstant(Mask - 1, IntValTy)); + SDValue And1 = DAG.getNode(ISD::AND, dl, IntValTy, Op1, + DAG.getConstant(Mask, IntValTy)); + SDValue Result = DAG.getNode(ISD::OR, dl, IntValTy, And0, And1); + return DAG.getNode(ISD::BITCAST, dl, ValTy, Result); } -static SDValue LowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool isLittle) { +// Called if the size of integer registers is not large enough to hold the whole +// floating point number (e.g. f64 & 32-bit integer register). +static SDValue +LowerFCOPYSIGNSmallIntReg(SDValue Op, SelectionDAG &DAG, bool isLittle) { // FIXME: // Use ext/ins instructions if target architecture is Mips32r2. // Eliminate redundant mfc1 and mtc1 instructions. @@ -1734,10 +1756,10 @@ SDValue MipsTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) assert(Ty == MVT::f32 || Ty == MVT::f64); - if (Ty == MVT::f32) - return LowerFCOPYSIGN32(Op, DAG); + if (Ty == MVT::f32 || HasMips64) + return LowerFCOPYSIGNLargeIntReg(Op, DAG); else - return LowerFCOPYSIGN64(Op, DAG, Subtarget->isLittle()); + return LowerFCOPYSIGNSmallIntReg(Op, DAG, Subtarget->isLittle()); } SDValue MipsTargetLowering:: @@ -2328,7 +2350,7 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee, // node so that legalize doesn't hack it. 
unsigned char OpFlag; bool IsPICCall = (IsN64 || IsPIC); // true if calls are translated to jalr $25 - bool LoadSymAddr = false; + bool GlobalOrExternal = false; SDValue CalleeLo; if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { @@ -2345,7 +2367,7 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee, getPointerTy(), 0, OpFlag); } - LoadSymAddr = true; + GlobalOrExternal = true; } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { if (IsN64 || (!IsO32 && IsPIC)) @@ -2356,16 +2378,16 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee, OpFlag = MipsII::MO_GOT_CALL; Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), OpFlag); - LoadSymAddr = true; + GlobalOrExternal = true; } SDValue InFlag; // Create nodes that load address of callee and copy it to T9 if (IsPICCall) { - if (LoadSymAddr) { + if (GlobalOrExternal) { // Load callee address - Callee = DAG.getNode(MipsISD::WrapperPIC, dl, getPointerTy(), Callee); + Callee = DAG.getNode(MipsISD::Wrapper, dl, getPointerTy(), Callee); SDValue LoadValue = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(), false, false, false, 0); @@ -2377,7 +2399,11 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee, } else Callee = LoadValue; } + } + // T9 should contain the address of the callee function if + // -reloction-model=pic or it is an indirect call. + if (IsPICCall || !GlobalOrExternal) { // copy to T9 unsigned T9Reg = IsN64 ? Mips::T9_64 : Mips::T9; Chain = DAG.getCopyToReg(Chain, dl, T9Reg, Callee, SDValue(0, 0)); diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index f2b64e3..81d093f 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -40,13 +40,6 @@ namespace llvm { // Handle gp_rel (small data/bss sections) relocation. GPRel, - // General Dynamic TLS - TlsGd, - - // Local Exec TLS - TprelHi, - TprelLo, - // Thread Pointer ThreadPointer, @@ -79,7 +72,7 @@ namespace llvm { BuildPairF64, ExtractElementF64, - WrapperPIC, + Wrapper, DynAlloc, diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index e1725fa..21a1862 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -115,7 +115,7 @@ class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, let Inst{15-0} = imm16; } -class CBranchBase<bits<6> op, dag outs, dag ins, string asmstr, +class BranchBase<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin, FrmI> { diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index 5358dc0..ea101f7 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -29,8 +29,8 @@ using namespace llvm; MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm) : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), TM(tm), IsN64(TM.getSubtarget<MipsSubtarget>().isABI_N64()), - RI(*TM.getSubtargetImpl(), *this) {} - + RI(*TM.getSubtargetImpl(), *this), + UncondBrOpc(TM.getRelocationModel() == Reloc::PIC_ ? 
Mips::B : Mips::J) {} const MipsRegisterInfo &MipsInstrInfo::getRegisterInfo() const { return RI; @@ -236,7 +236,8 @@ static unsigned GetAnalyzableBrOpc(unsigned Opc) { Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ || Opc == Mips::BEQ64 || Opc == Mips::BNE64 || Opc == Mips::BGTZ64 || Opc == Mips::BGEZ64 || Opc == Mips::BLTZ64 || Opc == Mips::BLEZ64 || - Opc == Mips::BC1T || Opc == Mips::BC1F || Opc == Mips::J) ? + Opc == Mips::BC1T || Opc == Mips::BC1F || Opc == Mips::B || + Opc == Mips::J) ? Opc : 0; } @@ -320,7 +321,7 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // If there is only one terminator instruction, process it. if (!SecondLastOpc) { // Unconditional branch - if (LastOpc == Mips::J) { + if (LastOpc == UncondBrOpc) { TBB = LastInst->getOperand(0).getMBB(); return false; } @@ -337,7 +338,7 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // If second to last instruction is an unconditional branch, // analyze it and remove the last instruction. - if (SecondLastOpc == Mips::J) { + if (SecondLastOpc == UncondBrOpc) { // Return if the last instruction cannot be removed. if (!AllowModify) return true; @@ -349,7 +350,7 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // Conditional branch followed by an unconditional branch. // The last one must be unconditional. - if (LastOpc != Mips::J) + if (LastOpc != UncondBrOpc) return true; AnalyzeCondBr(SecondLastInst, SecondLastOpc, TBB, Cond); @@ -391,14 +392,14 @@ InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, // Two-way Conditional branch. if (FBB) { BuildCondBr(MBB, TBB, DL, Cond); - BuildMI(&MBB, DL, get(Mips::J)).addMBB(FBB); + BuildMI(&MBB, DL, get(UncondBrOpc)).addMBB(FBB); return 2; } // One way branch. // Unconditional branch. if (Cond.empty()) - BuildMI(&MBB, DL, get(Mips::J)).addMBB(TBB); + BuildMI(&MBB, DL, get(UncondBrOpc)).addMBB(TBB); else // Conditional branch. BuildCondBr(MBB, TBB, DL, Cond); return 1; diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index 8fa3052..70cc2cf 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -34,6 +34,7 @@ class MipsInstrInfo : public MipsGenInstrInfo { MipsTargetMachine &TM; bool IsN64; const MipsRegisterInfo RI; + unsigned UncondBrOpc; public: explicit MipsInstrInfo(MipsTargetMachine &TM); diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 0ae94ab..9fcc5fd 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -107,7 +107,7 @@ def MipsDivRemU : SDNode<"MipsISD::DivRemU", SDT_MipsDivRem, // movn %got(d)($gp), %got(c)($gp), $4 // This instruction is illegal since movn can take only register operands. -def MipsWrapperPIC : SDNode<"MipsISD::WrapperPIC", SDTIntUnaryOp>; +def MipsWrapper : SDNode<"MipsISD::Wrapper", SDTIntUnaryOp>; // Pointer to dynamically allocated stack area. def MipsDynAlloc : SDNode<"MipsISD::DynAlloc", SDT_MipsDynAlloc, @@ -132,6 +132,8 @@ def NotMips64 : Predicate<"!Subtarget.hasMips64()">; def HasMips64r2 : Predicate<"Subtarget.hasMips64r2()">; def IsN64 : Predicate<"Subtarget.isABI_N64()">; def NotN64 : Predicate<"!Subtarget.isABI_N64()">; +def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; +def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">; //===----------------------------------------------------------------------===// // Mips Operand, Complex Patterns and Transformations Definitions. 
@@ -194,12 +196,12 @@ def size_ins : Operand<i32> { // Transformation Function - get the lower 16 bits. def LO16 : SDNodeXForm<imm, [{ - return getI32Imm((unsigned)N->getZExtValue() & 0xFFFF); + return getImm(N, N->getZExtValue() & 0xFFFF); }]>; // Transformation Function - get the higher 16 bits. def HI16 : SDNodeXForm<imm, [{ - return getI32Imm((unsigned)N->getZExtValue() >> 16); + return getImm(N, (N->getZExtValue() >> 16) & 0xFFFF); }]>; // Node immediate fits as 16-bit sign extended on target immediate. @@ -380,21 +382,13 @@ class StoreM<bits<6> op, string instr_asm, PatFrag OpNode, RegisterClass RC, let isPseudo = Pseudo; } -// Memory Load/Store +// Unaligned Memory Load/Store let canFoldAsLoad = 1 in -class LoadX<bits<6> op, RegisterClass RC, - Operand MemOpnd>: - FMem<op, (outs RC:$rt), (ins MemOpnd:$addr), - "", - [], IILoad> { -} +class LoadUnAlign<bits<6> op, RegisterClass RC, Operand MemOpnd>: + FMem<op, (outs RC:$rt), (ins MemOpnd:$addr), "", [], IILoad> {} -class StoreX<bits<6> op, RegisterClass RC, - Operand MemOpnd>: - FMem<op, (outs), (ins RC:$rt, MemOpnd:$addr), - "", - [], IIStore> { -} +class StoreUnAlign<bits<6> op, RegisterClass RC, Operand MemOpnd>: + FMem<op, (outs), (ins RC:$rt, MemOpnd:$addr), "", [], IIStore> {} // 32-bit load. multiclass LoadM32<bits<6> op, string instr_asm, PatFrag OpNode, @@ -415,10 +409,10 @@ multiclass LoadM64<bits<6> op, string instr_asm, PatFrag OpNode, } // 32-bit load. -multiclass LoadX32<bits<6> op> { - def #NAME# : LoadX<op, CPURegs, mem>, +multiclass LoadUnAlign32<bits<6> op> { + def #NAME# : LoadUnAlign<op, CPURegs, mem>, Requires<[NotN64]>; - def _P8 : LoadX<op, CPURegs, mem64>, + def _P8 : LoadUnAlign<op, CPURegs, mem64>, Requires<[IsN64]>; } // 32-bit store. @@ -440,18 +434,18 @@ multiclass StoreM64<bits<6> op, string instr_asm, PatFrag OpNode, } // 32-bit store. 
-multiclass StoreX32<bits<6> op> { - def #NAME# : StoreX<op, CPURegs, mem>, +multiclass StoreUnAlign32<bits<6> op> { + def #NAME# : StoreUnAlign<op, CPURegs, mem>, Requires<[NotN64]>; - def _P8 : StoreX<op, CPURegs, mem64>, + def _P8 : StoreUnAlign<op, CPURegs, mem64>, Requires<[IsN64]>; } // Conditional Branch class CBranch<bits<6> op, string instr_asm, PatFrag cond_op, RegisterClass RC>: - CBranchBase<op, (outs), (ins RC:$rs, RC:$rt, brtarget:$imm16), - !strconcat(instr_asm, "\t$rs, $rt, $imm16"), - [(brcond (i32 (cond_op RC:$rs, RC:$rt)), bb:$imm16)], IIBranch> { + BranchBase<op, (outs), (ins RC:$rs, RC:$rt, brtarget:$imm16), + !strconcat(instr_asm, "\t$rs, $rt, $imm16"), + [(brcond (i32 (cond_op RC:$rs, RC:$rt)), bb:$imm16)], IIBranch> { let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 1; @@ -459,9 +453,9 @@ class CBranch<bits<6> op, string instr_asm, PatFrag cond_op, RegisterClass RC>: class CBranchZero<bits<6> op, bits<5> _rt, string instr_asm, PatFrag cond_op, RegisterClass RC>: - CBranchBase<op, (outs), (ins RC:$rs, brtarget:$imm16), - !strconcat(instr_asm, "\t$rs, $imm16"), - [(brcond (i32 (cond_op RC:$rs, 0)), bb:$imm16)], IIBranch> { + BranchBase<op, (outs), (ins RC:$rs, brtarget:$imm16), + !strconcat(instr_asm, "\t$rs, $imm16"), + [(brcond (i32 (cond_op RC:$rs, 0)), bb:$imm16)], IIBranch> { let rt = _rt; let isBranch = 1; let isTerminator = 1; @@ -485,11 +479,29 @@ class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op, Operand Od, [(set CPURegs:$rt, (cond_op RC:$rs, imm_type:$imm16))], IIAlu>; -// Unconditional branch -let isBranch=1, isTerminator=1, isBarrier=1, hasDelaySlot = 1 in +// Jump class JumpFJ<bits<6> op, string instr_asm>: FJ<op, (outs), (ins jmptarget:$target), - !strconcat(instr_asm, "\t$target"), [(br bb:$target)], IIBranch>; + !strconcat(instr_asm, "\t$target"), [(br bb:$target)], IIBranch> { + let isBranch=1; + let isTerminator=1; + let isBarrier=1; + let hasDelaySlot = 1; + let Predicates = [RelocStatic]; +} + +// Unconditional branch +class UncondBranch<bits<6> op, string instr_asm>: + BranchBase<op, (outs), (ins brtarget:$imm16), + !strconcat(instr_asm, "\t$imm16"), [(br bb:$imm16)], IIBranch> { + let rs = 0; + let rt = 0; + let isBranch = 1; + let isTerminator = 1; + let isBarrier = 1; + let hasDelaySlot = 1; + let Predicates = [RelocPIC]; +} let isBranch=1, isTerminator=1, isBarrier=1, rd=0, hasDelaySlot = 1, isIndirectBranch = 1 in @@ -616,21 +628,37 @@ class ByteSwap<bits<6> func, bits<5> sa, string instr_asm>: } // Read Hardware -class ReadHardware: FR<0x1f, 0x3b, (outs CPURegs:$rt), (ins HWRegs:$rd), - "rdhwr\t$rt, $rd", [], IIAlu> { +class ReadHardware<RegisterClass CPURegClass, RegisterClass HWRegClass> + : FR<0x1f, 0x3b, (outs CPURegClass:$rt), (ins HWRegClass:$rd), + "rdhwr\t$rt, $rd", [], IIAlu> { let rs = 0; let shamt = 0; } // Ext and Ins -class ExtIns<bits<6> _funct, string instr_asm, dag outs, dag ins, - list<dag> pattern, InstrItinClass itin>: - FR<0x1f, _funct, outs, ins, !strconcat(instr_asm, " $rt, $rs, $pos, $sz"), - pattern, itin>, Requires<[HasMips32r2]> { +class ExtBase<bits<6> _funct, string instr_asm, RegisterClass RC>: + FR<0x1f, _funct, (outs RC:$rt), (ins RC:$rs, uimm16:$pos, size_ext:$sz), + !strconcat(instr_asm, " $rt, $rs, $pos, $sz"), + [(set RC:$rt, (MipsExt RC:$rs, imm:$pos, imm:$sz))], NoItinerary> { bits<5> pos; bits<5> sz; let rd = sz; let shamt = pos; + let Predicates = [HasMips32r2]; +} + +class InsBase<bits<6> _funct, string instr_asm, RegisterClass RC>: + FR<0x1f, _funct, (outs RC:$rt), + (ins RC:$rs, 
uimm16:$pos, size_ins:$sz, RC:$src), + !strconcat(instr_asm, " $rt, $rs, $pos, $sz"), + [(set RC:$rt, (MipsIns RC:$rs, imm:$pos, imm:$sz, RC:$src))], + NoItinerary> { + bits<5> pos; + bits<5> sz; + let rd = sz; + let shamt = pos; + let Predicates = [HasMips32r2]; + let Constraints = "$src = $rt"; } // Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*). @@ -795,10 +823,10 @@ defm USH : StoreM32<0x29, "ush", truncstorei16_u, 1>; defm USW : StoreM32<0x2b, "usw", store_u, 1>; /// Primitives for unaligned -defm LWL : LoadX32<0x22>; -defm LWR : LoadX32<0x26>; -defm SWL : StoreX32<0x2A>; -defm SWR : StoreX32<0x2E>; +defm LWL : LoadUnAlign32<0x22>; +defm LWR : LoadUnAlign32<0x26>; +defm SWL : StoreUnAlign32<0x2A>; +defm SWR : StoreUnAlign32<0x2E>; let hasSideEffects = 1 in def SYNC : MipsInst<(outs), (ins i32imm:$stype), "sync $stype", @@ -822,6 +850,7 @@ def J : JumpFJ<0x02, "j">; def JR : JumpFR<0x00, 0x08, "jr", CPURegs>; def JAL : JumpLink<0x03, "jal">; def JALR : JumpLinkReg<0x00, 0x09, "jalr">; +def B : UncondBranch<0x04, "b">; def BEQ : CBranch<0x04, "beq", seteq, CPURegs>; def BNE : CBranch<0x05, "bne", setne, CPURegs>; def BGEZ : CBranchZero<0x01, 1, "bgez", setge, CPURegs>; @@ -888,21 +917,10 @@ def MSUBU : MArithR<5, "msubu", MipsMSubu>; def MUL : ArithLogicR<0x1c, 0x02, "mul", mul, IIImul, CPURegs, 1>, Requires<[HasMips32]>; -def RDHWR : ReadHardware; - -def EXT : ExtIns<0, "ext", (outs CPURegs:$rt), - (ins CPURegs:$rs, uimm16:$pos, size_ext:$sz), - [(set CPURegs:$rt, - (MipsExt CPURegs:$rs, immZExt5:$pos, immZExt5:$sz))], - NoItinerary>; +def RDHWR : ReadHardware<CPURegs, HWRegs>; -let Constraints = "$src = $rt" in -def INS : ExtIns<4, "ins", (outs CPURegs:$rt), - (ins CPURegs:$rs, uimm16:$pos, size_ins:$sz, CPURegs:$src), - [(set CPURegs:$rt, - (MipsIns CPURegs:$rs, immZExt5:$pos, immZExt5:$sz, - CPURegs:$src))], - NoItinerary>; +def EXT : ExtBase<0, "ext", CPURegs>; +def INS : InsBase<4, "ins", CPURegs>; //===----------------------------------------------------------------------===// // Arbitrary patterns that map to one or more instructions @@ -939,11 +957,13 @@ def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; def : Pat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>; def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>; def : Pat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>; +def : Pat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>; def : Pat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>; def : Pat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>; def : Pat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>; def : Pat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>; +def : Pat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>; def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)), (ADDiu CPURegs:$hi, tglobaladdr:$lo)>; @@ -953,6 +973,8 @@ def : Pat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)), (ADDiu CPURegs:$hi, tjumptable:$lo)>; def : Pat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)), (ADDiu CPURegs:$hi, tconstpool:$lo)>; +def : Pat<(add CPURegs:$hi, (MipsLo tglobaltlsaddr:$lo)), + (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>; // gp_rel relocs def : Pat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)), @@ -960,26 +982,17 @@ def : Pat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)), def : Pat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)), (ADDiu CPURegs:$gp, tconstpool:$in)>; -// tlsgd -def : Pat<(add CPURegs:$gp, (MipsTlsGd tglobaltlsaddr:$in)), - (ADDiu CPURegs:$gp, tglobaltlsaddr:$in)>; - 
-// tprel hi/lo -def : Pat<(MipsTprelHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>; -def : Pat<(MipsTprelLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>; -def : Pat<(add CPURegs:$hi, (MipsTprelLo tglobaltlsaddr:$lo)), - (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>; - // wrapper_pic -class WrapperPICPat<SDNode node>: - Pat<(MipsWrapperPIC node:$in), - (ADDiu GP, node:$in)>; - -def : WrapperPICPat<tglobaladdr>; -def : WrapperPICPat<tconstpool>; -def : WrapperPICPat<texternalsym>; -def : WrapperPICPat<tblockaddress>; -def : WrapperPICPat<tjumptable>; +class WrapperPat<SDNode node, Instruction ADDiuOp, Register GPReg>: + Pat<(MipsWrapper node:$in), + (ADDiuOp GPReg, node:$in)>; + +def : WrapperPat<tglobaladdr, ADDiu, GP>; +def : WrapperPat<tconstpool, ADDiu, GP>; +def : WrapperPat<texternalsym, ADDiu, GP>; +def : WrapperPat<tblockaddress, ADDiu, GP>; +def : WrapperPat<tjumptable, ADDiu, GP>; +def : WrapperPat<tglobaltlsaddr, ADDiu, GP>; // Mips does not have "not", so we expand our way def : Pat<(not CPURegs:$in), diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index 6fc2af1..23486d3 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -41,10 +41,14 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, case MipsII::MO_NO_FLAG: Kind = MCSymbolRefExpr::VK_None; break; case MipsII::MO_GPREL: Kind = MCSymbolRefExpr::VK_Mips_GPREL; break; case MipsII::MO_GOT_CALL: Kind = MCSymbolRefExpr::VK_Mips_GOT_CALL; break; + case MipsII::MO_GOT16: Kind = MCSymbolRefExpr::VK_Mips_GOT16; break; case MipsII::MO_GOT: Kind = MCSymbolRefExpr::VK_Mips_GOT; break; case MipsII::MO_ABS_HI: Kind = MCSymbolRefExpr::VK_Mips_ABS_HI; break; case MipsII::MO_ABS_LO: Kind = MCSymbolRefExpr::VK_Mips_ABS_LO; break; case MipsII::MO_TLSGD: Kind = MCSymbolRefExpr::VK_Mips_TLSGD; break; + case MipsII::MO_TLSLDM: Kind = MCSymbolRefExpr::VK_Mips_TLSLDM; break; + case MipsII::MO_DTPREL_HI:Kind = MCSymbolRefExpr::VK_Mips_DTPREL_HI; break; + case MipsII::MO_DTPREL_LO:Kind = MCSymbolRefExpr::VK_Mips_DTPREL_LO; break; case MipsII::MO_GOTTPREL: Kind = MCSymbolRefExpr::VK_Mips_GOTTPREL; break; case MipsII::MO_TPREL_HI: Kind = MCSymbolRefExpr::VK_Mips_TPREL_HI; break; case MipsII::MO_TPREL_LO: Kind = MCSymbolRefExpr::VK_Mips_TPREL_LO; break; @@ -136,14 +140,35 @@ void MipsMCInstLower::LowerCPLOAD(const MachineInstr *MI, } // Lower ".cprestore offset" to "sw $gp, offset($sp)". 
-void MipsMCInstLower::LowerCPRESTORE(const MachineInstr *MI, MCInst &OutMI) { - OutMI.clear(); - OutMI.setOpcode(Mips::SW); - OutMI.addOperand(MCOperand::CreateReg(Mips::GP)); - OutMI.addOperand(MCOperand::CreateReg(Mips::SP)); +void MipsMCInstLower::LowerCPRESTORE(const MachineInstr *MI, + SmallVector<MCInst, 4>& MCInsts) { const MachineOperand &MO = MI->getOperand(0); assert(MO.isImm() && "CPRESTORE's operand must be an immediate."); - OutMI.addOperand(MCOperand::CreateImm(MO.getImm())); + unsigned Offset = MO.getImm(), Reg = Mips::SP; + MCInst Sw; + + if (Offset >= 0x8000) { + unsigned Hi = (Offset >> 16) + ((Offset & 0x8000) != 0); + Offset &= 0xffff; + Reg = Mips::AT; + + // lui at,hi + // addu at,at,sp + MCInsts.resize(2); + MCInsts[0].setOpcode(Mips::LUi); + MCInsts[0].addOperand(MCOperand::CreateReg(Mips::AT)); + MCInsts[0].addOperand(MCOperand::CreateImm(Hi)); + MCInsts[1].setOpcode(Mips::ADDu); + MCInsts[1].addOperand(MCOperand::CreateReg(Mips::AT)); + MCInsts[1].addOperand(MCOperand::CreateReg(Mips::AT)); + MCInsts[1].addOperand(MCOperand::CreateReg(Mips::SP)); + } + + Sw.setOpcode(Mips::SW); + Sw.addOperand(MCOperand::CreateReg(Mips::GP)); + Sw.addOperand(MCOperand::CreateReg(Reg)); + Sw.addOperand(MCOperand::CreateImm(Offset)); + MCInsts.push_back(Sw); } MCOperand MipsMCInstLower::LowerOperand(const MachineOperand& MO, diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h index 98e37e4..1490c14 100644 --- a/lib/Target/Mips/MipsMCInstLower.h +++ b/lib/Target/Mips/MipsMCInstLower.h @@ -36,7 +36,7 @@ public: MipsAsmPrinter &asmprinter); void Lower(const MachineInstr *MI, MCInst &OutMI) const; void LowerCPLOAD(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts); - void LowerCPRESTORE(const MachineInstr *MI, MCInst &OutMI); + void LowerCPRESTORE(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts); void LowerUnalignedLoadStore(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts); private: diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 06c4a66..e5a0f08 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -125,6 +125,7 @@ getRegisterNumbering(unsigned RegEnum) case Mips::D14: return 28; case Mips::SP: case Mips::SP_64: case Mips::F29: case Mips::D29_64: + case Mips::HWR29: return 29; case Mips::FP: case Mips::FP_64: case Mips::F30: case Mips::D30_64: case Mips::D15: diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index 925ad9e..76ee2e6 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -239,6 +239,7 @@ let Namespace = "Mips" in { // Hardware register $29 def HWR29 : Register<"29">; + def HWR29_64 : Register<"29">; } //===----------------------------------------------------------------------===// @@ -301,3 +302,5 @@ def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)> { // Hardware registers def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>; +def HWRegs64 : RegisterClass<"Mips", [i64], 32, (add HWR29_64)>; + diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 016d449..dc299f2 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -31,7 +31,7 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, { std::string CPUName = CPU; if (CPUName.empty()) - CPUName = "mips32r1"; + CPUName = "mips32"; // Parse features string. 
ParseSubtargetFeatures(CPUName, FS); diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 5d6b24f..02887fa 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -34,51 +34,51 @@ extern "C" void LLVMInitializeMipsTarget() { // Using CodeModel::Large enables different CALL behavior. MipsTargetMachine:: MipsTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, - bool isLittle): - LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), - Subtarget(TT, CPU, FS, isLittle), - DataLayout(isLittle ? - (Subtarget.isABI_N64() ? - "e-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" : - "e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32") : - (Subtarget.isABI_N64() ? - "E-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" : - "E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32")), - InstrInfo(*this), - FrameLowering(Subtarget), - TLInfo(*this), TSInfo(*this), JITInfo() { + bool isLittle) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, isLittle), + DataLayout(isLittle ? + (Subtarget.isABI_N64() ? + "e-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" : + "e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32") : + (Subtarget.isABI_N64() ? + "E-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" : + "E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32")), + InstrInfo(*this), + FrameLowering(Subtarget), + TLInfo(*this), TSInfo(*this), JITInfo() { } MipsebTargetMachine:: MipsebTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) : - MipsTargetMachine(T, TT, CPU, FS, RM, CM, OL, false) {} + CodeGenOpt::Level OL) + : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} MipselTargetMachine:: MipselTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) : - MipsTargetMachine(T, TT, CPU, FS, RM, CM, OL, true) {} + CodeGenOpt::Level OL) + : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} Mips64ebTargetMachine:: Mips64ebTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) : - MipsTargetMachine(T, TT, CPU, FS, RM, CM, OL, false) {} + CodeGenOpt::Level OL) + : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} Mips64elTargetMachine:: Mips64elTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) : - MipsTargetMachine(T, TT, CPU, FS, RM, CM, OL, true) {} + CodeGenOpt::Level OL) + : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} // Install an instruction selector pass using // the ISelDag to gen Mips code. 
@@ -120,4 +120,3 @@ bool MipsTargetMachine::addCodeEmitter(PassManagerBase &PM, PM.add(createMipsJITCodeEmitterPass(*this, JCE)); return false; } - diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index e40d9e2..6842373 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -38,7 +38,7 @@ namespace llvm { public: MipsTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle); @@ -82,7 +82,7 @@ namespace llvm { class MipsebTargetMachine : public MipsTargetMachine { public: MipsebTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -92,7 +92,7 @@ public: class MipselTargetMachine : public MipsTargetMachine { public: MipselTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -103,6 +103,7 @@ class Mips64ebTargetMachine : public MipsTargetMachine { public: Mips64ebTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -113,6 +114,7 @@ class Mips64elTargetMachine : public MipsTargetMachine { public: Mips64elTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; diff --git a/lib/Target/Mips/TargetInfo/CMakeLists.txt b/lib/Target/Mips/TargetInfo/CMakeLists.txt index 5692604..4172d00 100644 --- a/lib/Target/Mips/TargetInfo/CMakeLists.txt +++ b/lib/Target/Mips/TargetInfo/CMakeLists.txt @@ -4,10 +4,4 @@ add_llvm_library(LLVMMipsInfo MipsTargetInfo.cpp ) -add_llvm_library_dependencies(LLVMMipsInfo - LLVMMC - LLVMSupport - LLVMTarget - ) - add_dependencies(LLVMMipsInfo MipsCommonTableGen) diff --git a/lib/Target/Mips/TargetInfo/LLVMBuild.txt b/lib/Target/Mips/TargetInfo/LLVMBuild.txt index 90ae260..2d42568 100644 --- a/lib/Target/Mips/TargetInfo/LLVMBuild.txt +++ b/lib/Target/Mips/TargetInfo/LLVMBuild.txt @@ -21,4 +21,3 @@ name = MipsInfo parent = Mips required_libraries = MC Support Target add_to_library_groups = Mips - diff --git a/lib/Target/PTX/CMakeLists.txt b/lib/Target/PTX/CMakeLists.txt index 6709c1b..a9f4330 100644 --- a/lib/Target/PTX/CMakeLists.txt +++ b/lib/Target/PTX/CMakeLists.txt @@ -25,20 +25,6 @@ add_llvm_target(PTXCodeGen PTXTargetMachine.cpp ) -add_llvm_library_dependencies(LLVMPTXCodeGen - LLVMAnalysis - LLVMAsmPrinter - LLVMCodeGen - LLVMCore - LLVMMC - LLVMPTXDesc - LLVMPTXInfo - LLVMSelectionDAG - LLVMSupport - LLVMTarget - LLVMTransformUtils - ) - add_subdirectory(TargetInfo) add_subdirectory(InstPrinter) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/PTX/InstPrinter/CMakeLists.txt b/lib/Target/PTX/InstPrinter/CMakeLists.txt index 029d060..b252893 100644 --- a/lib/Target/PTX/InstPrinter/CMakeLists.txt +++ b/lib/Target/PTX/InstPrinter/CMakeLists.txt @@ -6,8 +6,3 @@ add_llvm_library(LLVMPTXAsmPrinter add_dependencies(LLVMPTXAsmPrinter PTXCommonTableGen) -add_llvm_library_dependencies(LLVMPTXAsmPrinter - LLVMMC - LLVMSupport - ) - diff --git a/lib/Target/PTX/InstPrinter/LLVMBuild.txt 
b/lib/Target/PTX/InstPrinter/LLVMBuild.txt index be89c10..af5d200 100644 --- a/lib/Target/PTX/InstPrinter/LLVMBuild.txt +++ b/lib/Target/PTX/InstPrinter/LLVMBuild.txt @@ -21,4 +21,3 @@ name = PTXAsmPrinter parent = PTX required_libraries = MC Support add_to_library_groups = PTX - diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp b/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp index 2f6c92d..5fecb85 100644 --- a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp +++ b/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp @@ -38,7 +38,50 @@ StringRef PTXInstPrinter::getOpcodeName(unsigned Opcode) const { } void PTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << getRegisterName(RegNo); + // Decode the register number into type and offset + unsigned RegSpace = RegNo & 0x7; + unsigned RegType = (RegNo >> 3) & 0x7; + unsigned RegOffset = RegNo >> 6; + + // Print the register + OS << "%"; + + switch (RegSpace) { + default: + llvm_unreachable("Unknown register space!"); + case PTXRegisterSpace::Reg: + switch (RegType) { + default: + llvm_unreachable("Unknown register type!"); + case PTXRegisterType::Pred: + OS << "p"; + break; + case PTXRegisterType::B16: + OS << "rh"; + break; + case PTXRegisterType::B32: + OS << "r"; + break; + case PTXRegisterType::B64: + OS << "rd"; + break; + case PTXRegisterType::F32: + OS << "f"; + break; + case PTXRegisterType::F64: + OS << "fd"; + break; + } + break; + case PTXRegisterSpace::Return: + OS << "ret"; + break; + case PTXRegisterSpace::Argument: + OS << "arg"; + break; + } + + OS << RegOffset; } void PTXInstPrinter::printInst(const MCInst *MI, raw_ostream &O, @@ -139,6 +182,8 @@ void PTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } else { O << "0000000000000000"; } + } else if (Op.isReg()) { + printRegName(O, Op.getReg()); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); const MCExpr *Expr = Op.getExpr(); diff --git a/lib/Target/PTX/LLVMBuild.txt b/lib/Target/PTX/LLVMBuild.txt index 22c70de..15a1eb5 100644 --- a/lib/Target/PTX/LLVMBuild.txt +++ b/lib/Target/PTX/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = InstPrinter MCTargetDesc TargetInfo + [component_0] type = TargetGroup name = PTX @@ -27,4 +30,3 @@ name = PTXCodeGen parent = PTX required_libraries = Analysis AsmPrinter CodeGen Core MC PTXDesc PTXInfo SelectionDAG Support Target TransformUtils add_to_library_groups = PTX - diff --git a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt b/lib/Target/PTX/MCTargetDesc/CMakeLists.txt index 94dbcee..d1fd74c 100644 --- a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/PTX/MCTargetDesc/CMakeLists.txt @@ -3,11 +3,4 @@ add_llvm_library(LLVMPTXDesc PTXMCAsmInfo.cpp ) -add_llvm_library_dependencies(LLVMPTXDesc - LLVMMC - LLVMPTXAsmPrinter - LLVMPTXInfo - LLVMSupport - ) - add_dependencies(LLVMPTXDesc PTXCommonTableGen) diff --git a/lib/Target/PTX/MCTargetDesc/LLVMBuild.txt b/lib/Target/PTX/MCTargetDesc/LLVMBuild.txt index fff21c1..19b80c5 100644 --- a/lib/Target/PTX/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/PTX/MCTargetDesc/LLVMBuild.txt @@ -21,4 +21,3 @@ name = PTXDesc parent = PTX required_libraries = MC PTXAsmPrinter PTXInfo Support add_to_library_groups = PTX - diff --git a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h b/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h index c6094be..77a298d 100644 --- a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h +++ b/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h @@ -17,6 +17,8 @@ 
#ifndef PTXBASEINFO_H #define PTXBASEINFO_H +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "PTXMCTargetDesc.h" namespace llvm { @@ -57,6 +59,75 @@ namespace llvm { RndPosInfInt = 10 // .rpi }; } // namespace PTXII + + namespace PTXRegisterType { + // Register type encoded in MCOperands + enum { + Pred = 0, + B16, + B32, + B64, + F32, + F64 + }; + } // namespace PTXRegisterType + + namespace PTXRegisterSpace { + // Register space encoded in MCOperands + enum { + Reg = 0, + Local, + Param, + Argument, + Return + }; + } + + inline static void decodeRegisterName(raw_ostream &OS, + unsigned EncodedReg) { + OS << "%"; + + unsigned RegSpace = EncodedReg & 0x7; + unsigned RegType = (EncodedReg >> 3) & 0x7; + unsigned RegOffset = EncodedReg >> 6; + + switch (RegSpace) { + default: + llvm_unreachable("Unknown register space!"); + case PTXRegisterSpace::Reg: + switch (RegType) { + default: + llvm_unreachable("Unknown register type!"); + case PTXRegisterType::Pred: + OS << "p"; + break; + case PTXRegisterType::B16: + OS << "rh"; + break; + case PTXRegisterType::B32: + OS << "r"; + break; + case PTXRegisterType::B64: + OS << "rd"; + break; + case PTXRegisterType::F32: + OS << "f"; + break; + case PTXRegisterType::F64: + OS << "fd"; + break; + } + break; + case PTXRegisterSpace::Return: + OS << "ret"; + break; + case PTXRegisterSpace::Argument: + OS << "arg"; + break; + } + + OS << RegOffset; + } } // namespace llvm #endif diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp index bdf238b..77ed71d 100644 --- a/lib/Target/PTX/PTXAsmPrinter.cpp +++ b/lib/Target/PTX/PTXAsmPrinter.cpp @@ -51,23 +51,23 @@ using namespace llvm; static const char PARAM_PREFIX[] = "__param_"; static const char RETURN_PREFIX[] = "__ret_"; -static const char *getRegisterTypeName(unsigned RegNo, - const MachineRegisterInfo& MRI) { - const TargetRegisterClass *TRC = MRI.getRegClass(RegNo); - -#define TEST_REGCLS(cls, clsstr) \ - if (PTX::cls ## RegisterClass == TRC) return # clsstr; - - TEST_REGCLS(RegPred, pred); - TEST_REGCLS(RegI16, b16); - TEST_REGCLS(RegI32, b32); - TEST_REGCLS(RegI64, b64); - TEST_REGCLS(RegF32, b32); - TEST_REGCLS(RegF64, b64); -#undef TEST_REGCLS - - llvm_unreachable("Not in any register class!"); - return NULL; +static const char *getRegisterTypeName(unsigned RegType) { + switch (RegType) { + default: + llvm_unreachable("Unknown register type"); + case PTXRegisterType::Pred: + return ".pred"; + case PTXRegisterType::B16: + return ".b16"; + case PTXRegisterType::B32: + return ".b32"; + case PTXRegisterType::B64: + return ".b64"; + case PTXRegisterType::F32: + return ".f32"; + case PTXRegisterType::F64: + return ".f64"; + } } static const char *getStateSpaceName(unsigned addressSpace) { @@ -188,32 +188,32 @@ void PTXAsmPrinter::EmitFunctionBodyStart() { unsigned numRegs; // pred - numRegs = MFI->getNumRegistersForClass(PTX::RegPredRegisterClass); + numRegs = MFI->countRegisters(PTXRegisterType::Pred, PTXRegisterSpace::Reg); if(numRegs > 0) os << "\t.reg .pred %p<" << numRegs << ">;\n"; // i16 - numRegs = MFI->getNumRegistersForClass(PTX::RegI16RegisterClass); + numRegs = MFI->countRegisters(PTXRegisterType::B16, PTXRegisterSpace::Reg); if(numRegs > 0) os << "\t.reg .b16 %rh<" << numRegs << ">;\n"; // i32 - numRegs = MFI->getNumRegistersForClass(PTX::RegI32RegisterClass); + numRegs = MFI->countRegisters(PTXRegisterType::B32, PTXRegisterSpace::Reg); if(numRegs > 0) os << "\t.reg .b32 %r<" << numRegs << ">;\n"; // i64 - numRegs = 
MFI->getNumRegistersForClass(PTX::RegI64RegisterClass); + numRegs = MFI->countRegisters(PTXRegisterType::B64, PTXRegisterSpace::Reg); if(numRegs > 0) os << "\t.reg .b64 %rd<" << numRegs << ">;\n"; // f32 - numRegs = MFI->getNumRegistersForClass(PTX::RegF32RegisterClass); + numRegs = MFI->countRegisters(PTXRegisterType::F32, PTXRegisterSpace::Reg); if(numRegs > 0) os << "\t.reg .f32 %f<" << numRegs << ">;\n"; // f64 - numRegs = MFI->getNumRegistersForClass(PTX::RegF64RegisterClass); + numRegs = MFI->countRegisters(PTXRegisterType::F64, PTXRegisterSpace::Reg); if(numRegs > 0) os << "\t.reg .f64 %fd<" << numRegs << ">;\n"; @@ -368,7 +368,6 @@ void PTXAsmPrinter::EmitFunctionEntryLabel() { const PTXParamManager &PM = MFI->getParamManager(); const bool isKernel = MFI->isKernel(); const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>(); - const MachineRegisterInfo& MRI = MF->getRegInfo(); SmallString<128> decl; raw_svector_ostream os(decl); @@ -391,7 +390,7 @@ void PTXAsmPrinter::EmitFunctionEntryLabel() { if (i != b) os << ", "; - os << ".reg ." << getRegisterTypeName(*i, MRI) << ' ' + os << ".reg " << getRegisterTypeName(MFI->getRegisterType(*i)) << ' ' << MFI->getRegisterName(*i); } } @@ -450,7 +449,7 @@ void PTXAsmPrinter::EmitFunctionEntryLabel() { if (i != b) os << ", "; - os << ".reg ." << getRegisterTypeName(*i, MRI) << ' ' + os << ".reg " << getRegisterTypeName(MFI->getRegisterType(*i)) << ' ' << MFI->getRegisterName(*i); } } @@ -521,20 +520,18 @@ MCOperand PTXAsmPrinter::GetSymbolRef(const MachineOperand &MO, MCOperand PTXAsmPrinter::lowerOperand(const MachineOperand &MO) { MCOperand MCOp; const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>(); - const MCExpr *Expr; - const char *RegSymbolName; + unsigned EncodedReg; switch (MO.getType()) { default: llvm_unreachable("Unknown operand type"); case MachineOperand::MO_Register: - // We create register operands as symbols, since the PTXInstPrinter class - // has no way to map virtual registers back to a name without some ugly - // hacks. - // FIXME: Figure out a better way to handle virtual register naming. - RegSymbolName = MFI->getRegisterName(MO.getReg()); - Expr = MCSymbolRefExpr::Create(RegSymbolName, MCSymbolRefExpr::VK_None, - OutContext); - MCOp = MCOperand::CreateExpr(Expr); + if (MO.getReg() > 0) { + // Encode the register + EncodedReg = MFI->getEncodedRegister(MO.getReg()); + } else { + EncodedReg = 0; + } + MCOp = MCOperand::CreateReg(EncodedReg); break; case MachineOperand::MO_Immediate: MCOp = MCOperand::CreateImm(MO.getImm()); diff --git a/lib/Target/PTX/PTXFPRoundingModePass.cpp b/lib/Target/PTX/PTXFPRoundingModePass.cpp index 0b653e0..a21d172 100644 --- a/lib/Target/PTX/PTXFPRoundingModePass.cpp +++ b/lib/Target/PTX/PTXFPRoundingModePass.cpp @@ -23,9 +23,11 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +using namespace llvm; + // NOTE: PTXFPRoundingModePass should be executed just before emission. -namespace llvm { +namespace { /// PTXFPRoundingModePass - Pass to assign appropriate FP rounding modes to /// all FP instructions. 
Essentially, this pass just looks for all FP /// instructions that have a rounding mode set to RndDefault, and sets an @@ -58,7 +60,7 @@ namespace llvm { void initializeMap(); void processInstruction(MachineInstr &MI); }; // class PTXFPRoundingModePass -} // namespace llvm +} // end anonymous namespace using namespace llvm; diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp index 17191fb..a012297 100644 --- a/lib/Target/PTX/PTXISelLowering.cpp +++ b/lib/Target/PTX/PTXISelLowering.cpp @@ -243,6 +243,30 @@ SDValue PTXTargetLowering:: for (unsigned i = 0, e = Ins.size(); i != e; ++i) { EVT RegVT = Ins[i].VT; TargetRegisterClass* TRC = getRegClassFor(RegVT); + unsigned RegType; + + // Determine which register class we need + if (RegVT == MVT::i1) { + RegType = PTXRegisterType::Pred; + } + else if (RegVT == MVT::i16) { + RegType = PTXRegisterType::B16; + } + else if (RegVT == MVT::i32) { + RegType = PTXRegisterType::B32; + } + else if (RegVT == MVT::i64) { + RegType = PTXRegisterType::B64; + } + else if (RegVT == MVT::f32) { + RegType = PTXRegisterType::F32; + } + else if (RegVT == MVT::f64) { + RegType = PTXRegisterType::F64; + } + else { + llvm_unreachable("Unknown parameter type"); + } // Use a unique index in the instruction to prevent instruction folding. // Yes, this is a hack. @@ -253,7 +277,7 @@ SDValue PTXTargetLowering:: InVals.push_back(ArgValue); - MFI->addArgReg(Reg); + MFI->addRegister(Reg, RegType, PTXRegisterSpace::Argument); } } @@ -304,25 +328,32 @@ SDValue PTXTargetLowering:: for (unsigned i = 0, e = Outs.size(); i != e; ++i) { EVT RegVT = Outs[i].VT; TargetRegisterClass* TRC = 0; + unsigned RegType; // Determine which register class we need if (RegVT == MVT::i1) { TRC = PTX::RegPredRegisterClass; + RegType = PTXRegisterType::Pred; } else if (RegVT == MVT::i16) { TRC = PTX::RegI16RegisterClass; + RegType = PTXRegisterType::B16; } else if (RegVT == MVT::i32) { TRC = PTX::RegI32RegisterClass; + RegType = PTXRegisterType::B32; } else if (RegVT == MVT::i64) { TRC = PTX::RegI64RegisterClass; + RegType = PTXRegisterType::B64; } else if (RegVT == MVT::f32) { TRC = PTX::RegF32RegisterClass; + RegType = PTXRegisterType::F32; } else if (RegVT == MVT::f64) { TRC = PTX::RegF64RegisterClass; + RegType = PTXRegisterType::F64; } else { llvm_unreachable("Unknown parameter type"); @@ -335,7 +366,7 @@ SDValue PTXTargetLowering:: Chain = DAG.getNode(PTXISD::WRITE_PARAM, dl, MVT::Other, Copy, OutReg); - MFI->addRetReg(Reg); + MFI->addRegister(Reg, RegType, PTXRegisterSpace::Return); } } diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp index 1b947a5..871b3a7 100644 --- a/lib/Target/PTX/PTXInstrInfo.cpp +++ b/lib/Target/PTX/PTXInstrInfo.cpp @@ -116,7 +116,7 @@ bool PTXInstrInfo::isPredicated(const MachineInstr *MI) const { } bool PTXInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { - return !isPredicated(MI) && get(MI->getOpcode()).isTerminator(); + return !isPredicated(MI) && MI->isTerminator(); } bool PTXInstrInfo:: @@ -184,15 +184,13 @@ AnalyzeBranch(MachineBasicBlock &MBB, if (MBB.empty()) return true; - MachineBasicBlock::const_iterator iter = MBB.end(); + MachineBasicBlock::iterator iter = MBB.end(); const MachineInstr& instLast1 = *--iter; - const MCInstrDesc &desc1 = instLast1.getDesc(); // for special case that MBB has only 1 instruction const bool IsSizeOne = MBB.size() == 1; // if IsSizeOne is true, *--iter and instLast2 are invalid // we put a dummy value in instLast2 and desc2 since they are used const 
MachineInstr& instLast2 = IsSizeOne ? instLast1 : *--iter; - const MCInstrDesc &desc2 = IsSizeOne ? desc1 : instLast2.getDesc(); DEBUG(dbgs() << "\n"); DEBUG(dbgs() << "AnalyzeBranch: opcode: " << instLast1.getOpcode() << "\n"); @@ -207,7 +205,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, } // this block ends with only an unconditional branch - if (desc1.isUnconditionalBranch() && + if (instLast1.isUnconditionalBranch() && // when IsSizeOne is true, it "absorbs" the evaluation of instLast2 (IsSizeOne || !IsAnyKindOfBranch(instLast2))) { DEBUG(dbgs() << "AnalyzeBranch: ends with only uncond branch\n"); @@ -217,7 +215,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, // this block ends with a conditional branch and // it falls through to a successor block - if (desc1.isConditionalBranch() && + if (instLast1.isConditionalBranch() && IsAnySuccessorAlsoLayoutSuccessor(MBB)) { DEBUG(dbgs() << "AnalyzeBranch: ends with cond branch and fall through\n"); TBB = GetBranchTarget(instLast1); @@ -233,8 +231,8 @@ AnalyzeBranch(MachineBasicBlock &MBB, // this block ends with a conditional branch // followed by an unconditional branch - if (desc2.isConditionalBranch() && - desc1.isUnconditionalBranch()) { + if (instLast2.isConditionalBranch() && + instLast1.isUnconditionalBranch()) { DEBUG(dbgs() << "AnalyzeBranch: ends with cond and uncond branch\n"); TBB = GetBranchTarget(instLast2); FBB = GetBranchTarget(instLast1); @@ -341,8 +339,7 @@ void PTXInstrInfo::AddDefaultPredicate(MachineInstr *MI) { } bool PTXInstrInfo::IsAnyKindOfBranch(const MachineInstr& inst) { - const MCInstrDesc &desc = inst.getDesc(); - return desc.isTerminator() || desc.isBranch() || desc.isIndirectBranch(); + return inst.isTerminator() || inst.isBranch() || inst.isIndirectBranch(); } bool PTXInstrInfo:: diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td index bcd5bcf..19a862f 100644 --- a/lib/Target/PTX/PTXInstrInfo.td +++ b/lib/Target/PTX/PTXInstrInfo.td @@ -825,17 +825,17 @@ let hasSideEffects = 1 in { ///===- Parameter Passing Pseudo-Instructions -----------------------------===// def READPARAMPRED : InstPTX<(outs RegPred:$a), (ins i32imm:$b), - "mov.pred\t$a, %param$b", []>; + "mov.pred\t$a, %arg$b", []>; def READPARAMI16 : InstPTX<(outs RegI16:$a), (ins i32imm:$b), - "mov.b16\t$a, %param$b", []>; + "mov.b16\t$a, %arg$b", []>; def READPARAMI32 : InstPTX<(outs RegI32:$a), (ins i32imm:$b), - "mov.b32\t$a, %param$b", []>; + "mov.b32\t$a, %arg$b", []>; def READPARAMI64 : InstPTX<(outs RegI64:$a), (ins i32imm:$b), - "mov.b64\t$a, %param$b", []>; + "mov.b64\t$a, %arg$b", []>; def READPARAMF32 : InstPTX<(outs RegF32:$a), (ins i32imm:$b), - "mov.f32\t$a, %param$b", []>; + "mov.f32\t$a, %arg$b", []>; def READPARAMF64 : InstPTX<(outs RegF64:$a), (ins i32imm:$b), - "mov.f64\t$a, %param$b", []>; + "mov.f64\t$a, %arg$b", []>; def WRITEPARAMPRED : InstPTX<(outs), (ins RegPred:$a), "//w", []>; def WRITEPARAMI16 : InstPTX<(outs), (ins RegI16:$a), "//w", []>; diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp index b33a273..26ec623 100644 --- a/lib/Target/PTX/PTXMFInfoExtract.cpp +++ b/lib/Target/PTX/PTXMFInfoExtract.cpp @@ -22,9 +22,11 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +using namespace llvm; + // NOTE: PTXMFInfoExtract must after register allocation! 
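A pattern repeated throughout this patch (PTXInstrInfo above, and the ARM and PowerPC files elsewhere in the diff) is replacing queries through a cached MCInstrDesc with the equivalent MachineInstr helpers, e.g. desc1.isUnconditionalBranch() becomes instLast1.isUnconditionalBranch() and inst.getDesc().isTerminator() becomes inst.isTerminator(). A small before/after sketch; it assumes an in-tree LLVM build of this vintage and is illustrative only:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/MC/MCInstrDesc.h"
using namespace llvm;

// Old style: fetch the static MCInstrDesc and query it.
static bool isAnyKindOfBranchOld(const MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  return Desc.isTerminator() || Desc.isBranch() || Desc.isIndirectBranch();
}

// New style: the MachineInstr forwarding helpers, as used by IsAnyKindOfBranch above.
static bool isAnyKindOfBranchNew(const MachineInstr &MI) {
  return MI.isTerminator() || MI.isBranch() || MI.isIndirectBranch();
}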
-namespace llvm { +namespace { /// PTXMFInfoExtract - PTX specific code to extract of PTX machine /// function information for PTXAsmPrinter /// @@ -42,7 +44,7 @@ namespace llvm { return "PTX Machine Function Info Extractor"; } }; // class PTXMFInfoExtract -} // namespace llvm +} // end anonymous namespace using namespace llvm; @@ -56,7 +58,20 @@ bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) { for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) { unsigned Reg = TargetRegisterInfo::index2VirtReg(i); const TargetRegisterClass *TRC = MRI.getRegClass(Reg); - MFI->addVirtualRegister(TRC, Reg); + unsigned RegType; + if (TRC == PTX::RegPredRegisterClass) + RegType = PTXRegisterType::Pred; + else if (TRC == PTX::RegI16RegisterClass) + RegType = PTXRegisterType::B16; + else if (TRC == PTX::RegI32RegisterClass) + RegType = PTXRegisterType::B32; + else if (TRC == PTX::RegI64RegisterClass) + RegType = PTXRegisterType::B64; + else if (TRC == PTX::RegF32RegisterClass) + RegType = PTXRegisterType::F32; + else if (TRC == PTX::RegF64RegisterClass) + RegType = PTXRegisterType::F64; + MFI->addRegister(Reg, RegType, PTXRegisterSpace::Reg); } return false; diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.h b/lib/Target/PTX/PTXMachineFunctionInfo.h index 3b985f7..1a2878c 100644 --- a/lib/Target/PTX/PTXMachineFunctionInfo.h +++ b/lib/Target/PTX/PTXMachineFunctionInfo.h @@ -35,15 +35,22 @@ private: DenseSet<unsigned> RegArgs; DenseSet<unsigned> RegRets; - typedef std::vector<unsigned> RegisterList; - typedef DenseMap<const TargetRegisterClass*, RegisterList> RegisterMap; - typedef DenseMap<unsigned, std::string> RegisterNameMap; typedef DenseMap<int, std::string> FrameMap; - RegisterMap UsedRegs; - RegisterNameMap RegNames; FrameMap FrameSymbols; + struct RegisterInfo { + unsigned Reg; + unsigned Type; + unsigned Space; + unsigned Offset; + unsigned Encoded; + }; + + typedef DenseMap<unsigned, RegisterInfo> RegisterInfoMap; + + RegisterInfoMap RegInfo; + PTXParamManager ParamManager; public: @@ -51,13 +58,7 @@ public: PTXMachineFunctionInfo(MachineFunction &MF) : IsKernel(false) { - UsedRegs[PTX::RegPredRegisterClass] = RegisterList(); - UsedRegs[PTX::RegI16RegisterClass] = RegisterList(); - UsedRegs[PTX::RegI32RegisterClass] = RegisterList(); - UsedRegs[PTX::RegI64RegisterClass] = RegisterList(); - UsedRegs[PTX::RegF32RegisterClass] = RegisterList(); - UsedRegs[PTX::RegF64RegisterClass] = RegisterList(); - } + } /// getParamManager - Returns the PTXParamManager instance for this function. 
PTXParamManager& getParamManager() { return ParamManager; } @@ -78,69 +79,106 @@ public: reg_iterator retreg_begin() const { return RegRets.begin(); } reg_iterator retreg_end() const { return RegRets.end(); } + /// addRegister - Adds a virtual register to the set of all used registers + void addRegister(unsigned Reg, unsigned RegType, unsigned RegSpace) { + if (!RegInfo.count(Reg)) { + RegisterInfo Info; + Info.Reg = Reg; + Info.Type = RegType; + Info.Space = RegSpace; + + // Determine register offset + Info.Offset = 0; + for(RegisterInfoMap::const_iterator i = RegInfo.begin(), + e = RegInfo.end(); i != e; ++i) { + const RegisterInfo& RI = i->second; + if (RI.Space == RegSpace) + if (RI.Space != PTXRegisterSpace::Reg || RI.Type == Info.Type) + Info.Offset++; + } + + // Encode the register data into a single register number + Info.Encoded = (Info.Offset << 6) | (Info.Type << 3) | Info.Space; + + RegInfo[Reg] = Info; + + if (RegSpace == PTXRegisterSpace::Argument) + RegArgs.insert(Reg); + else if (RegSpace == PTXRegisterSpace::Return) + RegRets.insert(Reg); + } + } + + /// countRegisters - Returns the number of registers of the given type and + /// space. + unsigned countRegisters(unsigned RegType, unsigned RegSpace) const { + unsigned Count = 0; + for(RegisterInfoMap::const_iterator i = RegInfo.begin(), e = RegInfo.end(); + i != e; ++i) { + const RegisterInfo& RI = i->second; + if (RI.Type == RegType && RI.Space == RegSpace) + Count++; + } + return Count; + } + + /// getEncodedRegister - Returns the encoded value of the register. + unsigned getEncodedRegister(unsigned Reg) const { + return RegInfo.lookup(Reg).Encoded; + } + /// addRetReg - Adds a register to the set of return-value registers. void addRetReg(unsigned Reg) { if (!RegRets.count(Reg)) { RegRets.insert(Reg); - std::string name; - name = "%ret"; - name += utostr(RegRets.size() - 1); - RegNames[Reg] = name; } } /// addArgReg - Adds a register to the set of function argument registers. void addArgReg(unsigned Reg) { RegArgs.insert(Reg); - std::string name; - name = "%param"; - name += utostr(RegArgs.size() - 1); - RegNames[Reg] = name; - } - - /// addVirtualRegister - Adds a virtual register to the set of all used - /// registers in the function. - void addVirtualRegister(const TargetRegisterClass *TRC, unsigned Reg) { - std::string name; - - // Do not count registers that are argument/return registers. - if (!RegRets.count(Reg) && !RegArgs.count(Reg)) { - UsedRegs[TRC].push_back(Reg); - if (TRC == PTX::RegPredRegisterClass) - name = "%p"; - else if (TRC == PTX::RegI16RegisterClass) - name = "%rh"; - else if (TRC == PTX::RegI32RegisterClass) - name = "%r"; - else if (TRC == PTX::RegI64RegisterClass) - name = "%rd"; - else if (TRC == PTX::RegF32RegisterClass) - name = "%f"; - else if (TRC == PTX::RegF64RegisterClass) - name = "%fd"; - else - llvm_unreachable("Invalid register class"); - - name += utostr(UsedRegs[TRC].size() - 1); - RegNames[Reg] = name; - } } /// getRegisterName - Returns the name of the specified virtual register. This /// name is used during PTX emission. 
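The key change in PTXMachineFunctionInfo is that each virtual register's space, type, and per-class index are now packed into a single integer, Encoded = (Offset << 6) | (Type << 3) | Space, which travels inside the MCOperand, and decodeRegisterName() in PTXBaseInfo.h unpacks it back into names like %r3, %ret0 or %arg1. A standalone round-trip sketch of that packing (plain C++ mirroring the enums, not the LLVM code itself):

#include <cassert>
#include <cstdio>
#include <string>

// Mirrors PTXRegisterType / PTXRegisterSpace from PTXBaseInfo.h.
enum RegType  { Pred = 0, B16, B32, B64, F32, F64 };
enum RegSpace { Reg = 0, Local, Param, Argument, Return };

static unsigned encodeReg(unsigned Offset, unsigned Type, unsigned Space) {
  return (Offset << 6) | (Type << 3) | Space;   // same layout as RegisterInfo::Encoded
}

static std::string decodeReg(unsigned Encoded) {
  unsigned Space  = Encoded & 0x7;
  unsigned Type   = (Encoded >> 3) & 0x7;
  unsigned Offset = Encoded >> 6;
  static const char *const Prefix[] = {"p", "rh", "r", "rd", "f", "fd"};
  std::string Name = "%";
  if (Space == Reg)           Name += Prefix[Type];
  else if (Space == Return)   Name += "ret";
  else if (Space == Argument) Name += "arg";
  return Name + std::to_string(Offset);
}

int main() {
  assert(decodeReg(encodeReg(3, B32, Reg)) == "%r3");
  assert(decodeReg(encodeReg(0, Pred, Reg)) == "%p0");
  assert(decodeReg(encodeReg(1, F64, Argument)) == "%arg1");
  std::printf("all round-trips ok\n");
  return 0;
}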
- const char *getRegisterName(unsigned Reg) const { - if (RegNames.count(Reg)) - return RegNames.find(Reg)->second.c_str(); + std::string getRegisterName(unsigned Reg) const { + if (RegInfo.count(Reg)) { + const RegisterInfo& RI = RegInfo.lookup(Reg); + std::string Name; + raw_string_ostream NameStr(Name); + decodeRegisterName(NameStr, RI.Encoded); + NameStr.flush(); + return Name; + } else if (Reg == PTX::NoRegister) return "%noreg"; else llvm_unreachable("Register not in register name map"); } - /// getNumRegistersForClass - Returns the number of virtual registers that are - /// used for the specified register class. - unsigned getNumRegistersForClass(const TargetRegisterClass *TRC) const { - return UsedRegs.lookup(TRC).size(); + /// getEncodedRegisterName - Returns the name of the encoded register. + std::string getEncodedRegisterName(unsigned EncodedReg) const { + std::string Name; + raw_string_ostream NameStr(Name); + decodeRegisterName(NameStr, EncodedReg); + NameStr.flush(); + return Name; + } + + /// getRegisterType - Returns the type of the specified virtual register. + unsigned getRegisterType(unsigned Reg) const { + if (RegInfo.count(Reg)) + return RegInfo.lookup(Reg).Type; + else + llvm_unreachable("Unknown register"); + } + + /// getOffsetForRegister - Returns the offset of the virtual register + unsigned getOffsetForRegister(unsigned Reg) const { + if (RegInfo.count(Reg)) + return RegInfo.lookup(Reg).Offset; + else + return 0; } /// getFrameSymbol - Returns the symbol name for the given FrameIndex. @@ -148,13 +186,13 @@ public: if (FrameSymbols.count(FrameIndex)) { return FrameSymbols.lookup(FrameIndex).c_str(); } else { - std::string Name = "__local"; - Name += utostr(FrameIndex); + std::string Name = "__local"; + Name += utostr(FrameIndex); // The whole point of caching this name is to ensure the pointer we pass // to any getExternalSymbol() calls will remain valid for the lifetime of // the back-end instance. This is to work around an issue in SelectionDAG // where symbol names are expected to be life-long strings. - FrameSymbols[FrameIndex] = Name; + FrameSymbols[FrameIndex] = Name; return FrameSymbols[FrameIndex].c_str(); } } diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp index 292ea5e..4efdc27 100644 --- a/lib/Target/PTX/PTXTargetMachine.cpp +++ b/lib/Target/PTX/PTXTargetMachine.cpp @@ -67,30 +67,16 @@ namespace { "e-p:32:32-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"; const char* DataLayout64 = "e-p:64:64-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"; - - // Copied from LLVMTargetMachine.cpp - void printNoVerify(PassManagerBase &PM, const char *Banner) { - if (PrintMachineCode) - PM.add(createMachineFunctionPrinterPass(dbgs(), Banner)); - } - - void printAndVerify(PassManagerBase &PM, - const char *Banner) { - if (PrintMachineCode) - PM.add(createMachineFunctionPrinterPass(dbgs(), Banner)); - - //if (VerifyMachineCode) - // PM.add(createMachineVerifierPass(Banner)); - } } // DataLayout and FrameLowering are filled with dummy data PTXTargetMachine::PTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64Bit) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), DataLayout(is64Bit ? 
DataLayout64 : DataLayout32), Subtarget(TT, CPU, FS, is64Bit), FrameLowering(Subtarget), @@ -101,16 +87,18 @@ PTXTargetMachine::PTXTargetMachine(const Target &T, PTX32TargetMachine::PTX32TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : PTXTargetMachine(T, TT, CPU, FS, RM, CM, OL, false) { + : PTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) { } PTX64TargetMachine::PTX64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : PTXTargetMachine(T, TT, CPU, FS, RM, CM, OL, true) { + : PTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) { } bool PTXTargetMachine::addInstSelector(PassManagerBase &PM) { diff --git a/lib/Target/PTX/PTXTargetMachine.h b/lib/Target/PTX/PTXTargetMachine.h index 19f6c0f..22911f7 100644 --- a/lib/Target/PTX/PTXTargetMachine.h +++ b/lib/Target/PTX/PTXTargetMachine.h @@ -35,7 +35,7 @@ class PTXTargetMachine : public LLVMTargetMachine { public: PTXTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64Bit); @@ -94,7 +94,7 @@ class PTX32TargetMachine : public PTXTargetMachine { public: PTX32TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; // class PTX32TargetMachine @@ -103,7 +103,7 @@ class PTX64TargetMachine : public PTXTargetMachine { public: PTX64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; // class PTX32TargetMachine diff --git a/lib/Target/PTX/TargetInfo/CMakeLists.txt b/lib/Target/PTX/TargetInfo/CMakeLists.txt index 2366e45..d9a5da3 100644 --- a/lib/Target/PTX/TargetInfo/CMakeLists.txt +++ b/lib/Target/PTX/TargetInfo/CMakeLists.txt @@ -4,10 +4,4 @@ add_llvm_library(LLVMPTXInfo PTXTargetInfo.cpp ) -add_llvm_library_dependencies(LLVMPTXInfo - LLVMMC - LLVMSupport - LLVMTarget - ) - add_dependencies(LLVMPTXInfo PTXCommonTableGen) diff --git a/lib/Target/PTX/TargetInfo/LLVMBuild.txt b/lib/Target/PTX/TargetInfo/LLVMBuild.txt index 8e5285a..2cc30c4 100644 --- a/lib/Target/PTX/TargetInfo/LLVMBuild.txt +++ b/lib/Target/PTX/TargetInfo/LLVMBuild.txt @@ -21,4 +21,3 @@ name = PTXInfo parent = PTX required_libraries = MC Support Target add_to_library_groups = PTX - diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt index 05c1ffd..1b85495 100644 --- a/lib/Target/PowerPC/CMakeLists.txt +++ b/lib/Target/PowerPC/CMakeLists.txt @@ -27,20 +27,6 @@ add_llvm_target(PowerPCCodeGen PPCSelectionDAGInfo.cpp ) -add_llvm_library_dependencies(LLVMPowerPCCodeGen - LLVMAnalysis - LLVMAsmPrinter - LLVMCodeGen - LLVMCore - LLVMMC - LLVMPowerPCAsmPrinter - LLVMPowerPCDesc - LLVMPowerPCInfo - LLVMSelectionDAG - LLVMSupport - LLVMTarget - ) - add_subdirectory(InstPrinter) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/PowerPC/InstPrinter/CMakeLists.txt b/lib/Target/PowerPC/InstPrinter/CMakeLists.txt index 1d857e2..a605cc4 100644 --- a/lib/Target/PowerPC/InstPrinter/CMakeLists.txt +++ b/lib/Target/PowerPC/InstPrinter/CMakeLists.txt @@ 
-4,9 +4,4 @@ add_llvm_library(LLVMPowerPCAsmPrinter PPCInstPrinter.cpp ) -add_llvm_library_dependencies(LLVMPowerPCAsmPrinter - LLVMMC - LLVMSupport - ) - add_dependencies(LLVMPowerPCAsmPrinter PowerPCCommonTableGen) diff --git a/lib/Target/PowerPC/InstPrinter/LLVMBuild.txt b/lib/Target/PowerPC/InstPrinter/LLVMBuild.txt index afbb2b1..7c691de 100644 --- a/lib/Target/PowerPC/InstPrinter/LLVMBuild.txt +++ b/lib/Target/PowerPC/InstPrinter/LLVMBuild.txt @@ -21,4 +21,3 @@ name = PowerPCAsmPrinter parent = PowerPC required_libraries = MC Support add_to_library_groups = PowerPC - diff --git a/lib/Target/PowerPC/LLVMBuild.txt b/lib/Target/PowerPC/LLVMBuild.txt index 5baa988..95fac54 100644 --- a/lib/Target/PowerPC/LLVMBuild.txt +++ b/lib/Target/PowerPC/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = InstPrinter MCTargetDesc TargetInfo + [component_0] type = TargetGroup name = PowerPC @@ -28,4 +31,3 @@ name = PowerPCCodeGen parent = PowerPC required_libraries = Analysis AsmPrinter CodeGen Core MC PowerPCAsmPrinter PowerPCDesc PowerPCInfo SelectionDAG Support Target add_to_library_groups = PowerPC - diff --git a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt index c4041db..febf438 100644 --- a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt @@ -6,11 +6,4 @@ add_llvm_library(LLVMPowerPCDesc PPCPredicates.cpp ) -add_llvm_library_dependencies(LLVMPowerPCDesc - LLVMMC - LLVMPowerPCAsmPrinter - LLVMPowerPCInfo - LLVMSupport - ) - add_dependencies(LLVMPowerPCDesc PowerPCCommonTableGen) diff --git a/lib/Target/PowerPC/MCTargetDesc/LLVMBuild.txt b/lib/Target/PowerPC/MCTargetDesc/LLVMBuild.txt index fc2da83..d3a567d 100644 --- a/lib/Target/PowerPC/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/PowerPC/MCTargetDesc/LLVMBuild.txt @@ -21,4 +21,3 @@ name = PowerPCDesc parent = PowerPC required_libraries = MC PowerPCAsmPrinter PowerPCInfo Support add_to_library_groups = PowerPC - diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 9f2fd6d..34a5774 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -93,6 +93,16 @@ public: // FIXME. return false; } + + bool fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const { + // FIXME. + assert(0 && "RelaxInstruction() unimplemented"); + return false; + } + void RelaxInstruction(const MCInst &Inst, MCInst &Res) const { // FIXME. diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 56f622e..5dc2d3d 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -365,11 +365,12 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } case PPC::MFCRpseud: + case PPC::MFCR8pseud: // Transform: %R3 = MFCRpseud %CR7 // Into: %R3 = MFCR ;; cr7 OutStreamer.AddComment(PPCInstPrinter:: getRegisterName(MI->getOperand(1).getReg())); - TmpInst.setOpcode(PPC::MFCR); + TmpInst.setOpcode(Subtarget.isPPC64() ? 
PPC::MFCR8 : PPC::MFCR); TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); OutStreamer.EmitInstruction(TmpInst); return; @@ -441,7 +442,7 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { Directive = PPC::DIR_970; if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400) Directive = PPC::DIR_7400; - if (Subtarget.isPPC64() && Directive < PPC::DIR_970) + if (Subtarget.isPPC64() && Directive < PPC::DIR_64) Directive = PPC::DIR_64; assert(Directive <= PPC::DIR_64 && "Directive out of range."); diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp index 4a1f182..9d2f4d0 100644 --- a/lib/Target/PowerPC/PPCCodeEmitter.cpp +++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp @@ -138,7 +138,8 @@ void PPCCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) { unsigned PPCCodeEmitter::get_crbitm_encoding(const MachineInstr &MI, unsigned OpNo) const { const MachineOperand &MO = MI.getOperand(OpNo); - assert((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MFOCRF) && + assert((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MTCRF8 || + MI.getOpcode() == PPC::MFOCRF) && (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7)); return 0x80 >> getPPCRegisterNumbering(MO.getReg()); } @@ -248,7 +249,8 @@ unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI, if (MO.isReg()) { // MTCRF/MFOCRF should go through get_crbitm_encoding for the CR operand. // The GPR operand should come through here though. - assert((MI.getOpcode() != PPC::MTCRF && MI.getOpcode() != PPC::MFOCRF) || + assert((MI.getOpcode() != PPC::MTCRF && MI.getOpcode() != PPC::MTCRF8 && + MI.getOpcode() != PPC::MFOCRF) || MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7); return getPPCRegisterNumbering(MO.getReg()); } diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 0b85fea..5c45018 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -64,7 +64,7 @@ static void RemoveVRSaveCode(MachineInstr *MI) { // epilog blocks. for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) { // If last instruction is a return instruction, add an epilogue - if (!I->empty() && I->back().getDesc().isReturn()) { + if (!I->empty() && I->back().isReturn()) { bool FoundIt = false; for (MBBI = I->end(); MBBI != I->begin(); ) { --MBBI; @@ -244,8 +244,10 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const { if (MF.getFunction()->hasFnAttr(Attribute::Naked)) return false; - return DisableFramePointerElim(MF) || MFI->hasVarSizedObjects() || - (GuaranteedTailCallOpt && MF.getInfo<PPCFunctionInfo>()->hasFastCall()); + return MF.getTarget().Options.DisableFramePointerElim(MF) || + MFI->hasVarSizedObjects() || + (MF.getTarget().Options.GuaranteedTailCallOpt && + MF.getInfo<PPCFunctionInfo>()->hasFastCall()); } @@ -655,7 +657,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, // Callee pop calling convention. Pop parameter/linkage area. Used for tail // call optimization - if (GuaranteedTailCallOpt && RetOpcode == PPC::BLR && + if (MF.getTarget().Options.GuaranteedTailCallOpt && RetOpcode == PPC::BLR && MF.getFunction()->getCallingConv() == CallingConv::Fast) { PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); unsigned CallerAllocatedAmt = FI->getMinReservedArea(); @@ -758,7 +760,8 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Reserve stack space to move the linkage area to in case of a tail call. 
int TCSPDelta = 0; - if (GuaranteedTailCallOpt && (TCSPDelta = FI->getTailCallSPDelta()) < 0) { + if (MF.getTarget().Options.GuaranteedTailCallOpt && + (TCSPDelta = FI->getTailCallSPDelta()) < 0) { MFI->CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true); } @@ -769,7 +772,7 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // FIXME: doesn't detect whether or not we need to spill vXX, which requires // r0 for now. - if (RegInfo->requiresRegisterScavenging(MF)) // FIXME (64-bit): Enable. + if (RegInfo->requiresRegisterScavenging(MF)) if (needsFP(MF) || spillsCR(MF)) { const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; @@ -863,7 +866,8 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) // Take into account stack space reserved for tail calls. int TCSPDelta = 0; - if (GuaranteedTailCallOpt && (TCSPDelta = PFI->getTailCallSPDelta()) < 0) { + if (MF.getTarget().Options.GuaranteedTailCallOpt && + (TCSPDelta = PFI->getTailCallSPDelta()) < 0) { LowerBound = TCSPDelta; } diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index 3197fc8..ae317af 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -27,7 +27,6 @@ void PPCHazardRecognizer440::EmitInstruction(SUnit *SU) { const MCInstrDesc *MCID = DAG->getInstrDesc(SU); if (!MCID) { // This is a PPC pseudo-instruction. - // FIXME: Should something else be done? return; } @@ -62,6 +61,7 @@ void PPCHazardRecognizer440::EmitInstruction(SUnit *SU) { PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetInstrInfo &tii) : TII(tii) { + LastWasBL8_ELF = false; EndDispatchGroup(); } @@ -80,12 +80,6 @@ PPCHazardRecognizer970::GetInstrType(unsigned Opcode, bool &isFirst, bool &isSingle, bool &isCracked, bool &isLoad, bool &isStore) { - if ((int)Opcode >= 0) { - isFirst = isSingle = isCracked = isLoad = isStore = false; - return PPCII::PPC970_Pseudo; - } - Opcode = ~Opcode; - const MCInstrDesc &MCID = TII.get(Opcode); isLoad = MCID.mayLoad(); @@ -102,29 +96,23 @@ PPCHazardRecognizer970::GetInstrType(unsigned Opcode, /// isLoadOfStoredAddress - If we have a load from the previously stored pointer /// as indicated by StorePtr1/StorePtr2/StoreSize, return true. bool PPCHazardRecognizer970:: -isLoadOfStoredAddress(unsigned LoadSize, SDValue Ptr1, SDValue Ptr2) const { +isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset, + const Value *LoadValue) const { for (unsigned i = 0, e = NumStores; i != e; ++i) { // Handle exact and commuted addresses. - if (Ptr1 == StorePtr1[i] && Ptr2 == StorePtr2[i]) - return true; - if (Ptr2 == StorePtr1[i] && Ptr1 == StorePtr2[i]) + if (LoadValue == StoreValue[i] && LoadOffset == StoreOffset[i]) return true; // Okay, we don't have an exact match, if this is an indexed offset, see if // we have overlap (which happens during fp->int conversion for example). - if (StorePtr2[i] == Ptr2) { - if (ConstantSDNode *StoreOffset = dyn_cast<ConstantSDNode>(StorePtr1[i])) - if (ConstantSDNode *LoadOffset = dyn_cast<ConstantSDNode>(Ptr1)) { - // Okay the base pointers match, so we have [c1+r] vs [c2+r]. Check - // to see if the load and store actually overlap. 
- int StoreOffs = StoreOffset->getZExtValue(); - int LoadOffs = LoadOffset->getZExtValue(); - if (StoreOffs < LoadOffs) { - if (int(StoreOffs+StoreSize[i]) > LoadOffs) return true; - } else { - if (int(LoadOffs+LoadSize) > StoreOffs) return true; - } - } + if (StoreValue[i] == LoadValue) { + // Okay the base pointers match, so we have [c1+r] vs [c2+r]. Check + // to see if the load and store actually overlap. + if (StoreOffset[i] < LoadOffset) { + if (int64_t(StoreOffset[i]+StoreSize[i]) > LoadOffset) return true; + } else { + if (int64_t(LoadOffset+LoadSize) > StoreOffset[i]) return true; + } } } return false; @@ -138,13 +126,26 @@ ScheduleHazardRecognizer::HazardType PPCHazardRecognizer970:: getHazardType(SUnit *SU, int Stalls) { assert(Stalls == 0 && "PPC hazards don't support scoreboard lookahead"); - const SDNode *Node = SU->getNode()->getGluedMachineNode(); + MachineInstr *MI = SU->getInstr(); + + if (MI->isDebugValue()) + return NoHazard; + + unsigned Opcode = MI->getOpcode(); + + // If the last instruction was a BL8_ELF, then the NOP must follow it + // directly (this is strong requirement from the linker due to the ELF ABI). + // We return only Hazard (and not NoopHazard) because if the NOP is necessary + // then it will already be in the instruction stream (it is not always + // necessary; tail calls, for example, do not need it). + if (LastWasBL8_ELF && Opcode != PPC::NOP) + return Hazard; + bool isFirst, isSingle, isCracked, isLoad, isStore; PPCII::PPC970_Unit InstrType = - GetInstrType(Node->getOpcode(), isFirst, isSingle, isCracked, + GetInstrType(Opcode, isFirst, isSingle, isCracked, isLoad, isStore); if (InstrType == PPCII::PPC970_Pseudo) return NoHazard; - unsigned Opcode = Node->getMachineOpcode(); // We can only issue a PPC970_First/PPC970_Single instruction (such as // crand/mtspr/etc) if this is the first cycle of the dispatch group. @@ -181,55 +182,10 @@ getHazardType(SUnit *SU, int Stalls) { // If this is a load following a store, make sure it's not to the same or // overlapping address. 
- if (isLoad && NumStores) { - unsigned LoadSize; - switch (Opcode) { - default: llvm_unreachable("Unknown load!"); - case PPC::LBZ: case PPC::LBZU: - case PPC::LBZX: - case PPC::LBZ8: case PPC::LBZU8: - case PPC::LBZX8: - case PPC::LVEBX: - LoadSize = 1; - break; - case PPC::LHA: case PPC::LHAU: - case PPC::LHAX: - case PPC::LHZ: case PPC::LHZU: - case PPC::LHZX: - case PPC::LVEHX: - case PPC::LHBRX: - case PPC::LHA8: case PPC::LHAU8: - case PPC::LHAX8: - case PPC::LHZ8: case PPC::LHZU8: - case PPC::LHZX8: - LoadSize = 2; - break; - case PPC::LFS: case PPC::LFSU: - case PPC::LFSX: - case PPC::LWZ: case PPC::LWZU: - case PPC::LWZX: - case PPC::LWA: - case PPC::LWAX: - case PPC::LVEWX: - case PPC::LWBRX: - case PPC::LWZ8: - case PPC::LWZX8: - LoadSize = 4; - break; - case PPC::LFD: case PPC::LFDU: - case PPC::LFDX: - case PPC::LD: case PPC::LDU: - case PPC::LDX: - LoadSize = 8; - break; - case PPC::LVX: - case PPC::LVXL: - LoadSize = 16; - break; - } - - if (isLoadOfStoredAddress(LoadSize, - Node->getOperand(0), Node->getOperand(1))) + if (isLoad && NumStores && !MI->memoperands_empty()) { + MachineMemOperand *MO = *MI->memoperands_begin(); + if (isLoadOfStoredAddress(MO->getSize(), + MO->getOffset(), MO->getValue())) return NoopHazard; } @@ -237,66 +193,29 @@ getHazardType(SUnit *SU, int Stalls) { } void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) { - const SDNode *Node = SU->getNode()->getGluedMachineNode(); + MachineInstr *MI = SU->getInstr(); + + if (MI->isDebugValue()) + return; + + unsigned Opcode = MI->getOpcode(); + LastWasBL8_ELF = (Opcode == PPC::BL8_ELF); + bool isFirst, isSingle, isCracked, isLoad, isStore; PPCII::PPC970_Unit InstrType = - GetInstrType(Node->getOpcode(), isFirst, isSingle, isCracked, + GetInstrType(Opcode, isFirst, isSingle, isCracked, isLoad, isStore); if (InstrType == PPCII::PPC970_Pseudo) return; - unsigned Opcode = Node->getMachineOpcode(); // Update structural hazard information. if (Opcode == PPC::MTCTR || Opcode == PPC::MTCTR8) HasCTRSet = true; // Track the address stored to. 
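With the PPC970 hazard recognizer now running on MachineInstrs after register allocation, the size, offset, and base IR Value of a memory access come from the instruction's first MachineMemOperand instead of the hand-maintained opcode-to-size switches being deleted here, and the store/load conflict test in isLoadOfStoredAddress() reduces to an interval-overlap check on accesses that share a base Value. A standalone sketch of that overlap test (same arithmetic, plain C++):

#include <cassert>
#include <cstdint>

// Two accesses to the same base Value conflict iff their [off, off+size)
// intervals intersect; this mirrors the new body of isLoadOfStoredAddress().
static bool overlaps(int64_t LoadOff, uint64_t LoadSize,
                     int64_t StoreOff, uint64_t StoreSize) {
  if (StoreOff < LoadOff)
    return int64_t(StoreOff + StoreSize) > LoadOff;
  return int64_t(LoadOff + LoadSize) > StoreOff;
}

int main() {
  assert(overlaps(4, 4, 0, 8));    // store [0,8) covers load [4,8)
  assert(!overlaps(8, 4, 0, 8));   // store ends exactly where the load begins
  assert(overlaps(0, 8, 4, 4));    // commuted case: load covers the store
  return 0;
}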
- if (isStore) { - unsigned ThisStoreSize; - switch (Opcode) { - default: llvm_unreachable("Unknown store instruction!"); - case PPC::STB: case PPC::STB8: - case PPC::STBU: case PPC::STBU8: - case PPC::STBX: case PPC::STBX8: - case PPC::STVEBX: - ThisStoreSize = 1; - break; - case PPC::STH: case PPC::STH8: - case PPC::STHU: case PPC::STHU8: - case PPC::STHX: case PPC::STHX8: - case PPC::STVEHX: - case PPC::STHBRX: - ThisStoreSize = 2; - break; - case PPC::STFS: - case PPC::STFSU: - case PPC::STFSX: - case PPC::STWX: case PPC::STWX8: - case PPC::STWUX: - case PPC::STW: case PPC::STW8: - case PPC::STWU: - case PPC::STVEWX: - case PPC::STFIWX: - case PPC::STWBRX: - ThisStoreSize = 4; - break; - case PPC::STD_32: - case PPC::STDX_32: - case PPC::STD: - case PPC::STDU: - case PPC::STFD: - case PPC::STFDX: - case PPC::STDX: - case PPC::STDUX: - ThisStoreSize = 8; - break; - case PPC::STVX: - case PPC::STVXL: - ThisStoreSize = 16; - break; - } - - StoreSize[NumStores] = ThisStoreSize; - StorePtr1[NumStores] = Node->getOperand(1); - StorePtr2[NumStores] = Node->getOperand(2); + if (isStore && NumStores < 4 && !MI->memoperands_empty()) { + MachineMemOperand *MO = *MI->memoperands_begin(); + StoreSize[NumStores] = MO->getSize(); + StoreOffset[NumStores] = MO->getOffset(); + StoreValue[NumStores] = MO->getValue(); ++NumStores; } @@ -319,3 +238,9 @@ void PPCHazardRecognizer970::AdvanceCycle() { if (NumIssued == 5) EndDispatchGroup(); } + +void PPCHazardRecognizer970::Reset() { + LastWasBL8_ELF = false; + EndDispatchGroup(); +} + diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h index 32fac91..95d0d64 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.h +++ b/lib/Target/PowerPC/PPCHazardRecognizers.h @@ -49,14 +49,18 @@ class PPCHazardRecognizer970 : public ScheduleHazardRecognizer { // HasCTRSet - If the CTR register is set in this group, disallow BCTRL. bool HasCTRSet; + // Was the last instruction issued a BL8_ELF + bool LastWasBL8_ELF; + // StoredPtr - Keep track of the address of any store. If we see a load from // the same address (or one that aliases it), disallow the store. We can have // up to four stores in one dispatch group, hence we track up to 4. // // This is null if we haven't seen a store yet. We keep track of both // operands of the store here, since we support [r+r] and [r+i] addressing. - SDValue StorePtr1[4], StorePtr2[4]; - unsigned StoreSize[4]; + const Value *StoreValue[4]; + int64_t StoreOffset[4]; + uint64_t StoreSize[4]; unsigned NumStores; public: @@ -64,6 +68,7 @@ public: virtual HazardType getHazardType(SUnit *SU, int Stalls); virtual void EmitInstruction(SUnit *SU); virtual void AdvanceCycle(); + virtual void Reset(); private: /// EndDispatchGroup - Called when we are finishing a new dispatch group. @@ -76,8 +81,8 @@ private: bool &isFirst, bool &isSingle,bool &isCracked, bool &isLoad, bool &isStore); - bool isLoadOfStoredAddress(unsigned LoadSize, - SDValue Ptr1, SDValue Ptr2) const; + bool isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset, + const Value *LoadValue) const; }; } // end namespace llvm diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 3dee406..4a509a3 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -210,13 +210,13 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { // Find all return blocks, outputting a restore in each epilog. 
for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { - if (!BB->empty() && BB->back().getDesc().isReturn()) { + if (!BB->empty() && BB->back().isReturn()) { IP = BB->end(); --IP; // Skip over all terminator instructions, which are part of the return // sequence. MachineBasicBlock::iterator I2 = IP; - while (I2 != BB->begin() && (--I2)->getDesc().isTerminator()) + while (I2 != BB->begin() && (--I2)->isTerminator()) IP = I2; // Emit: MTVRSAVE InVRSave @@ -1066,7 +1066,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Target = N->getOperand(1); unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8; unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8; - Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Target, + Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Target, Chain), 0); return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain); } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 36d5c41..f3a3d17 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -103,6 +103,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // from FP_ROUND: that rounds to nearest, this rounds to zero. setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom); + // We do not currently implment this libm ops for PowerPC. + setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand); + setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand); + setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand); + setOperationAction(ISD::FRINT, MVT::ppcf128, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand); + // PowerPC has no SREM/UREM instructions setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); @@ -146,9 +153,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::BSWAP, MVT::i32 , Expand); setOperationAction(ISD::CTPOP, MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); setOperationAction(ISD::BSWAP, MVT::i64 , Expand); setOperationAction(ISD::CTPOP, MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); // PowerPC does not have ROTR setOperationAction(ISD::ROTR, MVT::i32 , Expand); @@ -332,7 +343,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); } // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle @@ -1667,7 +1680,8 @@ PPCTargetLowering::LowerFormalArguments_SVR4( EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Potential tail calls could cause overwriting of argument stack slots. - bool isImmutable = !(GuaranteedTailCallOpt && (CallConv==CallingConv::Fast)); + bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && + (CallConv == CallingConv::Fast)); unsigned PtrByteSize = 4; // Assign locations to all of the incoming arguments. 
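The PPCISelLowering edits above are the consumer side of the TargetOptions move: instead of reading the global GuaranteedTailCallOpt flag, lowering code now asks the owning target machine, e.g. getTargetMachine().Options.GuaranteedTailCallOpt inside a TargetLowering method, or MF.getTarget().Options.DisableFramePointerElim(MF) when a MachineFunction is at hand. A hedged sketch of the query pattern (assumes in-tree LLVM headers of this vintage, illustrative only):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

// Reads the per-target-machine option rather than a global cl::opt flag.
static bool wantsGuaranteedTailCalls(const MachineFunction &MF) {
  return MF.getTarget().Options.GuaranteedTailCallOpt;
}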
@@ -1857,7 +1871,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin( EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); bool isPPC64 = PtrVT == MVT::i64; // Potential tail calls could cause overwriting of argument stack slots. - bool isImmutable = !(GuaranteedTailCallOpt && (CallConv==CallingConv::Fast)); + bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && + (CallConv == CallingConv::Fast)); unsigned PtrByteSize = isPPC64 ? 8 : 4; unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true); @@ -2263,9 +2278,9 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG, PPCFrameLowering::getMinCallFrameSize(isPPC64, true)); // Tail call needs the stack to be aligned. - if (CC==CallingConv::Fast && GuaranteedTailCallOpt) { - unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameLowering()-> - getStackAlignment(); + if (CC == CallingConv::Fast && DAG.getTarget().Options.GuaranteedTailCallOpt){ + unsigned TargetAlign = DAG.getMachineFunction().getTarget(). + getFrameLowering()->getStackAlignment(); unsigned AlignMask = TargetAlign-1; NumBytes = (NumBytes + AlignMask) & ~AlignMask; } @@ -2299,7 +2314,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const { - if (!GuaranteedTailCallOpt) + if (!getTargetMachine().Options.GuaranteedTailCallOpt) return false; // Variable argument functions are not supported. @@ -2752,7 +2767,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, // the stack. Account for this here so these bytes can be pushed back on in // PPCRegisterInfo::eliminateCallFramePseudoInstr. int BytesCalleePops = - (CallConv==CallingConv::Fast && GuaranteedTailCallOpt) ? NumBytes : 0; + (CallConv == CallingConv::Fast && + getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; if (InFlag.getNode()) Ops.push_back(InFlag); @@ -2868,7 +2884,8 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). - if (GuaranteedTailCallOpt && CallConv==CallingConv::Fast) + if (getTargetMachine().Options.GuaranteedTailCallOpt && + CallConv == CallingConv::Fast) MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); // Count how many bytes are to be pushed on the stack, including the linkage @@ -3075,7 +3092,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). - if (GuaranteedTailCallOpt && CallConv==CallingConv::Fast) + if (getTargetMachine().Options.GuaranteedTailCallOpt && + CallConv == CallingConv::Fast) MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); unsigned nAltivecParamsAtEnd = 0; @@ -5754,7 +5772,8 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setFrameAddressIsTaken(true); - bool is31 = (DisableFramePointerElim(MF) || MFI->hasVarSizedObjects()) && + bool is31 = (getTargetMachine().Options.DisableFramePointerElim(MF) || + MFI->hasVarSizedObjects()) && MFI->getStackSize() && !MF.getFunction()->hasFnAttr(Attribute::Naked); unsigned FrameReg = isPPC64 ? (is31 ? 
PPC::X31 : PPC::X1) : diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index e88ad37..cdbc264 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -223,6 +223,18 @@ def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm), def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm), (TCRETURNri8 CTRRC8:$dst, imm:$imm)>; +// 64-bit CR instructions +def MTCRF8 : XFXForm_5<31, 144, (outs crbitm:$FXM), (ins G8RC:$rS), + "mtcrf $FXM, $rS", BrMCRX>, + PPC970_MicroCode, PPC970_Unit_CRU; + +def MFCR8pseud: XFXForm_3<31, 19, (outs G8RC:$rT), (ins crbitm:$FXM), + "", SprMFCR>, + PPC970_MicroCode, PPC970_Unit_CRU; + +def MFCR8 : XFXForm_3<31, 19, (outs G8RC:$rT), (ins), + "mfcr $rT", SprMFCR>, + PPC970_MicroCode, PPC970_Unit_CRU; //===----------------------------------------------------------------------===// // 64-bit SPR manipulation instrs. @@ -469,6 +481,12 @@ def RLDICR : MDForm_1<30, 1, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$ME), "rldicr $rA, $rS, $SH, $ME", IntRotateD, []>, isPPC64; + +def RLWINM8 : MForm_2<21, + (outs G8RC:$rA), (ins G8RC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME), + "rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral, + []>; + } // End FXU Operations. diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index b9a6297..6d16f1d 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -33,8 +33,8 @@ #include "PPCGenInstrInfo.inc" namespace llvm { -extern cl::opt<bool> EnablePPC32RS; // FIXME (64-bit): See PPCRegisterInfo.cpp. -extern cl::opt<bool> EnablePPC64RS; // FIXME (64-bit): See PPCRegisterInfo.cpp. +extern cl::opt<bool> DisablePPC32RS; +extern cl::opt<bool> DisablePPC64RS; } using namespace llvm; @@ -48,25 +48,32 @@ PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm) ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer( const TargetMachine *TM, const ScheduleDAG *DAG) const { - // Should use subtarget info to pick the right hazard recognizer. For - // now, always return a PPC970 recognizer. - const TargetInstrInfo *TII = TM->getInstrInfo(); - (void)TII; - assert(TII && "No InstrInfo?"); - unsigned Directive = TM->getSubtarget<PPCSubtarget>().getDarwinDirective(); if (Directive == PPC::DIR_440) { const InstrItineraryData *II = TM->getInstrItineraryData(); return new PPCHazardRecognizer440(II, DAG); } - else { - // Disable the hazard recognizer for now, as it doesn't support - // bottom-up scheduling. - //return new PPCHazardRecognizer970(*TII); - return new ScheduleHazardRecognizer(); - } + + return TargetInstrInfoImpl::CreateTargetHazardRecognizer(TM, DAG); } +/// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer +/// to use for this target when scheduling the DAG. +ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( + const InstrItineraryData *II, + const ScheduleDAG *DAG) const { + unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective(); + + // Most subtargets use a PPC970 recognizer.
+ if (Directive != PPC::DIR_440) { + const TargetInstrInfo *TII = TM.getInstrInfo(); + assert(TII && "No InstrInfo?"); + + return new PPCHazardRecognizer970(*TII); + } + + return TargetInstrInfoImpl::CreateTargetPostRAHazardRecognizer(II, DAG); +} unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { switch (MI->getOpcode()) { @@ -338,6 +345,7 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc)); } +// This function returns true if a CR spill is necessary and false otherwise. bool PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, unsigned SrcReg, bool isKill, @@ -369,7 +377,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, FrameIdx)); } else { // FIXME: this spills LR immediately to memory in one step. To do this, - // we use R11, which we know cannot be used in the prolog/epilog. This is + // we use X11, which we know cannot be used in the prolog/epilog. This is // a hack. NewMIs.push_back(BuildMI(MF, DL, get(PPC::MFLR8), PPC::X11)); NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD)) @@ -388,9 +396,8 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, getKillRegState(isKill)), FrameIdx)); } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) { - if ((EnablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) || - (EnablePPC64RS && TM.getSubtargetImpl()->isPPC64())) { - // FIXME (64-bit): Enable + if ((!DisablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) || + (!DisablePPC64RS && TM.getSubtargetImpl()->isPPC64())) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR)) .addReg(SrcReg, getKillRegState(isKill)), @@ -403,11 +410,14 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, // We hack this on Darwin by reserving R2. It's probably broken on Linux // at the moment. + bool is64Bit = TM.getSubtargetImpl()->isPPC64(); // We need to store the CR in the low 4-bits of the saved value. First, // issue a MFCR to save all of the CRBits. unsigned ScratchReg = TM.getSubtargetImpl()->isDarwinABI() ? - PPC::R2 : PPC::R0; - NewMIs.push_back(BuildMI(MF, DL, get(PPC::MFCRpseud), ScratchReg) + (is64Bit ? PPC::X2 : PPC::R2) : + (is64Bit ? PPC::X0 : PPC::R0); + NewMIs.push_back(BuildMI(MF, DL, get(is64Bit ? PPC::MFCR8pseud : + PPC::MFCRpseud), ScratchReg) .addReg(SrcReg, getKillRegState(isKill))); // If the saved register wasn't CR0, shift the bits left so that they are @@ -415,12 +425,14 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, if (SrcReg != PPC::CR0) { unsigned ShiftBits = getPPCRegisterNumbering(SrcReg)*4; // rlwinm scratch, scratch, ShiftBits, 0, 31. - NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), ScratchReg) + NewMIs.push_back(BuildMI(MF, DL, get(is64Bit ? PPC::RLWINM8 : + PPC::RLWINM), ScratchReg) .addReg(ScratchReg).addImm(ShiftBits) .addImm(0).addImm(31)); } - NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW)) + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(is64Bit ? 
+ PPC::STW8 : PPC::STW)) .addReg(ScratchReg, getKillRegState(isKill)), FrameIdx)); @@ -504,7 +516,7 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, NewMIs.back()->addMemOperand(MF, MMO); } -void +bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, unsigned DestReg, int FrameIdx, const TargetRegisterClass *RC, @@ -524,8 +536,8 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, FrameIdx)); } else { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), - PPC::R11), FrameIdx)); - NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR8)).addReg(PPC::R11)); + PPC::X11), FrameIdx)); + NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR8)).addReg(PPC::X11)); } } else if (PPC::F8RCRegisterClass->hasSubClassEq(RC)) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg), @@ -534,28 +546,37 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg), FrameIdx)); } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) { - // FIXME: We need a scatch reg here. The trouble with using R0 is that - // it's possible for the stack frame to be so big the save location is - // out of range of immediate offsets, necessitating another register. - // We hack this on Darwin by reserving R2. It's probably broken on Linux - // at the moment. - unsigned ScratchReg = TM.getSubtargetImpl()->isDarwinABI() ? - PPC::R2 : PPC::R0; - NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ), - ScratchReg), FrameIdx)); - - // If the reloaded register isn't CR0, shift the bits right so that they are - // in the right CR's slot. - if (DestReg != PPC::CR0) { - unsigned ShiftBits = getPPCRegisterNumbering(DestReg)*4; - // rlwinm r11, r11, 32-ShiftBits, 0, 31. - NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), ScratchReg) - .addReg(ScratchReg).addImm(32-ShiftBits).addImm(0) - .addImm(31)); + if ((!DisablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) || + (!DisablePPC64RS && TM.getSubtargetImpl()->isPPC64())) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, + get(PPC::RESTORE_CR), DestReg) + , FrameIdx)); + return true; + } else { + // FIXME: We need a scatch reg here. The trouble with using R0 is that + // it's possible for the stack frame to be so big the save location is + // out of range of immediate offsets, necessitating another register. + // We hack this on Darwin by reserving R2. It's probably broken on Linux + // at the moment. + unsigned ScratchReg = TM.getSubtargetImpl()->isDarwinABI() ? + PPC::R2 : PPC::R0; + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ), + ScratchReg), FrameIdx)); + + // If the reloaded register isn't CR0, shift the bits right so that they are + // in the right CR's slot. + if (DestReg != PPC::CR0) { + unsigned ShiftBits = getPPCRegisterNumbering(DestReg)*4; + // rlwinm r11, r11, 32-ShiftBits, 0, 31. + NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), ScratchReg) + .addReg(ScratchReg).addImm(32-ShiftBits).addImm(0) + .addImm(31)); + } + + NewMIs.push_back(BuildMI(MF, DL, get(TM.getSubtargetImpl()->isPPC64() ? 
+ PPC::MTCRF8 : PPC::MTCRF), DestReg) + .addReg(ScratchReg)); } - - NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTCRF), DestReg) - .addReg(ScratchReg)); } else if (PPC::CRBITRCRegisterClass->hasSubClassEq(RC)) { unsigned Reg = 0; @@ -600,6 +621,8 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, } else { llvm_unreachable("Unknown regclass!"); } + + return false; } void @@ -612,7 +635,10 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, SmallVector<MachineInstr*, 4> NewMIs; DebugLoc DL; if (MI != MBB.end()) DL = MI->getDebugLoc(); - LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs); + if (LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs)) { + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + FuncInfo->setSpillsCR(); + } for (unsigned i = 0, e = NewMIs.size(); i != e; ++i) MBB.insert(MI, NewMIs[i]); diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 90bacc9..e90f8cb 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -72,7 +72,7 @@ class PPCInstrInfo : public PPCGenInstrInfo { unsigned SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, SmallVectorImpl<MachineInstr*> &NewMIs) const; - void LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, + bool LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, unsigned DestReg, int FrameIdx, const TargetRegisterClass *RC, SmallVectorImpl<MachineInstr*> &NewMIs) const; @@ -88,6 +88,9 @@ public: ScheduleHazardRecognizer * CreateTargetHazardRecognizer(const TargetMachine *TM, const ScheduleDAG *DAG) const; + ScheduleHazardRecognizer * + CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const; unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 17f63e0..d4c9d10 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -349,7 +349,7 @@ def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>; //===----------------------------------------------------------------------===// // PowerPC Instruction Predicate Definitions. -def FPContractions : Predicate<"!NoExcessFPPrecision">; +def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">; def In32BitMode : Predicate<"!PPCSubTarget.isPPC64()">; def In64BitMode : Predicate<"PPCSubTarget.isPPC64()">; def IsBookE : Predicate<"PPCSubTarget.isBookE()">; @@ -399,7 +399,14 @@ let usesCustomInserter = 1, // Expanded after instruction selection. // SPILL_CR - Indicate that we're dumping the CR register, so we'll need to // scavenge a register for it. -def SPILL_CR : Pseudo<(outs), (ins GPRC:$cond, memri:$F), +let mayStore = 1 in +def SPILL_CR : Pseudo<(outs), (ins CRRC:$cond, memri:$F), + "", []>; + +// RESTORE_CR - Indicate that we're restoring the CR register (previously +// spilled), so we'll need to scavenge a register for it. 
+let mayLoad = 1 in +def RESTORE_CR : Pseudo<(outs CRRC:$cond), (ins memri:$F), "", []>; let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in { @@ -1091,7 +1098,7 @@ def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs GPRC:$rT), (ins), "mfspr $rT, 256", IntGeneral>, PPC970_DGroup_First, PPC970_Unit_FXU; -def MTCRF : XFXForm_5<31, 144, (outs), (ins crbitm:$FXM, GPRC:$rS), +def MTCRF : XFXForm_5<31, 144, (outs crbitm:$FXM), (ins GPRC:$rS), "mtcrf $FXM, $rS", BrMCRX>, PPC970_MicroCode, PPC970_Unit_CRU; diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 3ba9260..27f7f4a 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -46,15 +46,14 @@ #define GET_REGINFO_TARGET_DESC #include "PPCGenRegisterInfo.inc" -// FIXME (64-bit): Eventually enable by default. namespace llvm { -cl::opt<bool> EnablePPC32RS("enable-ppc32-regscavenger", +cl::opt<bool> DisablePPC32RS("disable-ppc32-regscavenger", cl::init(false), - cl::desc("Enable PPC32 register scavenger"), + cl::desc("Disable PPC32 register scavenger"), cl::Hidden); -cl::opt<bool> EnablePPC64RS("enable-ppc64-regscavenger", +cl::opt<bool> DisablePPC64RS("disable-ppc64-regscavenger", cl::init(false), - cl::desc("Enable PPC64 register scavenger"), + cl::desc("Disable PPC64 register scavenger"), cl::Hidden); } @@ -63,8 +62,8 @@ using namespace llvm; // FIXME (64-bit): Should be inlined. bool PPCRegisterInfo::requiresRegisterScavenging(const MachineFunction &) const { - return ((EnablePPC32RS && !Subtarget.isPPC64()) || - (EnablePPC64RS && Subtarget.isPPC64())); + return ((!DisablePPC32RS && !Subtarget.isPPC64()) || + (!DisablePPC64RS && Subtarget.isPPC64())); } PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST, @@ -120,10 +119,6 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31, - PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN, - PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN, - PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN, - PPC::LR, 0 }; @@ -149,10 +144,6 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31, - PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN, - PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN, - PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN, - 0 }; // 64-bit Darwin calling convention. 
@@ -174,10 +165,6 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31, - PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN, - PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN, - PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN, - PPC::LR8, 0 }; @@ -203,10 +190,6 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31, - PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN, - PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN, - PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN, - 0 }; @@ -247,9 +230,6 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(PPC::R13); Reserved.set(PPC::R31); - if (!requiresRegisterScavenging(MF)) - Reserved.set(PPC::R0); // FIXME (64-bit): Remove - Reserved.set(PPC::X0); Reserved.set(PPC::X1); Reserved.set(PPC::X13); @@ -259,7 +239,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { if (Subtarget.isSVR4ABI()) { Reserved.set(PPC::X2); } - // Reserve R2 on Darwin to hack around the problem of save/restore of CR + // Reserve X2 on Darwin to hack around the problem of save/restore of CR // when the stack frame is too big to address directly; we need two regs. // This is a hack. if (Subtarget.isDarwinABI()) { @@ -291,6 +271,8 @@ PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case PPC::F4RCRegClassID: case PPC::VRRCRegClassID: return 32 - DefaultSafety; + case PPC::CRRCRegClassID: + return 8 - DefaultSafety; } } @@ -301,7 +283,8 @@ PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, void PPCRegisterInfo:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - if (GuaranteedTailCallOpt && I->getOpcode() == PPC::ADJCALLSTACKUP) { + if (MF.getTarget().Options.GuaranteedTailCallOpt && + I->getOpcode() == PPC::ADJCALLSTACKUP) { // Add (actually subtract) back the amount the callee popped on return. if (int CalleeAmt = I->getOperand(1).getImm()) { bool is64Bit = Subtarget.isPPC64(); @@ -476,28 +459,32 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex, int SPAdj, RegScavenger *RS) const { // Get the instruction. - MachineInstr &MI = *II; // ; SPILL_CR <SrcReg>, <offset>, <FI> + MachineInstr &MI = *II; // ; SPILL_CR <SrcReg>, <offset> // Get the instruction's basic block. MachineBasicBlock &MBB = *MI.getParent(); DebugLoc dl = MI.getDebugLoc(); - const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; - const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC; - unsigned Reg = findScratchRegister(II, RS, RC, SPAdj); - unsigned SrcReg = MI.getOperand(0).getReg(); + // FIXME: Once LLVM supports creating virtual registers here, or the register + // scavenger can return multiple registers, stop using reserved registers + // here. + (void) SPAdj; + (void) RS; + bool LP64 = Subtarget.isPPC64(); + unsigned Reg = Subtarget.isDarwinABI() ? (LP64 ? PPC::X2 : PPC::R2) : + (LP64 ? PPC::X0 : PPC::R0); + unsigned SrcReg = MI.getOperand(0).getReg(); // We need to store the CR in the low 4-bits of the saved value. First, issue // an MFCRpsued to save all of the CRBits and, if needed, kill the SrcReg. - BuildMI(MBB, II, dl, TII.get(PPC::MFCRpseud), Reg) + BuildMI(MBB, II, dl, TII.get(LP64 ? 
PPC::MFCR8pseud : PPC::MFCRpseud), Reg) .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); // If the saved register wasn't CR0, shift the bits left so that they are in // CR0's slot. if (SrcReg != PPC::CR0) // rlwinm rA, rA, ShiftBits, 0, 31. - BuildMI(MBB, II, dl, TII.get(PPC::RLWINM), Reg) + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg) .addReg(Reg, RegState::Kill) .addImm(getPPCRegisterNumbering(SrcReg) * 4) .addImm(0) @@ -511,6 +498,48 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II, MBB.erase(II); } +void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II, + unsigned FrameIndex, int SPAdj, + RegScavenger *RS) const { + // Get the instruction. + MachineInstr &MI = *II; // ; <DestReg> = RESTORE_CR <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc dl = MI.getDebugLoc(); + + // FIXME: Once LLVM supports creating virtual registers here, or the register + // scavenger can return multiple registers, stop using reserved registers + // here. + (void) SPAdj; + (void) RS; + + bool LP64 = Subtarget.isPPC64(); + unsigned Reg = Subtarget.isDarwinABI() ? (LP64 ? PPC::X2 : PPC::R2) : + (LP64 ? PPC::X0 : PPC::R0); + unsigned DestReg = MI.getOperand(0).getReg(); + assert(MI.definesRegister(DestReg) && + "RESTORE_CR does not define its destination"); + + addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ), + Reg), FrameIndex); + + // If the reloaded register isn't CR0, shift the bits right so that they are + // in the right CR's slot. + if (DestReg != PPC::CR0) { + unsigned ShiftBits = getPPCRegisterNumbering(DestReg)*4; + // rlwinm r11, r11, 32-ShiftBits, 0, 31. + BuildMI(MBB, II, dl, TII.get(PPC::RLWINM), Reg) + .addReg(Reg).addImm(32-ShiftBits).addImm(0) + .addImm(31); + } + + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTCRF8 : PPC::MTCRF), DestReg) + .addReg(Reg); + + // Discard the pseudo instruction. + MBB.erase(II); +} + void PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, RegScavenger *RS) const { @@ -556,16 +585,23 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, return; } - // Special case for pseudo-op SPILL_CR. - if (requiresRegisterScavenging(MF)) // FIXME (64-bit): Enable by default. + // Special case for pseudo-ops SPILL_CR and RESTORE_CR. + if (requiresRegisterScavenging(MF)) { if (OpC == PPC::SPILL_CR) { lowerCRSpilling(II, FrameIndex, SPAdj, RS); return; + } else if (OpC == PPC::RESTORE_CR) { + lowerCRRestore(II, FrameIndex, SPAdj, RS); + return; } + } // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP). + + bool is64Bit = Subtarget.isPPC64(); MI.getOperand(FIOperandNo).ChangeToRegister(TFI->hasFP(MF) ? - PPC::R31 : PPC::R1, + (is64Bit ? PPC::X31 : PPC::R31) : + (is64Bit ? PPC::X1 : PPC::R1), false); // Figure out if the offset in the instruction is shifted right two bits. This @@ -611,19 +647,19 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // The offset doesn't fit into a single register, scavenge one to build the // offset in. - // FIXME: figure out what SPAdj is doing here. - // FIXME (64-bit): Use "findScratchRegister". 
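The rlwinm immediates used by lowerCRSpilling and lowerCRRestore above follow from the CR layout: MFCR places CR0 in the most significant four bits of the scratch GPR, so CRn's field sits 4*n bits lower, and a left rotate by 4*n (or by 32 - 4*n on the restore path) moves that field into or out of CR0's slot. A small self-contained sketch of that arithmetic (illustrative, not part of the patch; the CR value and field number are made up):

#include <cassert>
#include <cstdint>

// rlwinm rA, rS, SH, 0, 31 is a plain 32-bit rotate left by SH.
static uint32_t rotl32(uint32_t V, unsigned N) {
  N &= 31;
  return N ? (V << N) | (V >> (32 - N)) : V;
}

int main() {
  const unsigned CRNum = 3;            // say the spilled register is CR3
  const uint32_t WholeCR = 0x000A0000; // CR3's four bits hold the value 0xA
  // Spill path: rotate left by 4*CRNum so CR3's field lands in CR0's slot
  // before the low word is stored to the stack slot.
  uint32_t Spilled = rotl32(WholeCR, 4 * CRNum);
  assert((Spilled >> 28) == 0xA);
  // Restore path: rotate left by 32 - 4*CRNum to move the field back before
  // mtcrf writes it into CR3.
  uint32_t Restored = rotl32(Spilled, 32 - 4 * CRNum);
  assert(Restored == WholeCR);
  return 0;
}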
unsigned SReg; - if (requiresRegisterScavenging(MF)) - SReg = findScratchRegister(II, RS, &PPC::GPRCRegClass, SPAdj); - else - SReg = PPC::R0; + if (requiresRegisterScavenging(MF)) { + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + SReg = findScratchRegister(II, RS, is64Bit ? G8RC : GPRC, SPAdj); + } else + SReg = is64Bit ? PPC::X0 : PPC::R0; // Insert a set of rA with the full offset value before the ld, st, or add - BuildMI(MBB, II, dl, TII.get(PPC::LIS), SReg) + BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SReg) .addImm(Offset >> 16); - BuildMI(MBB, II, dl, TII.get(PPC::ORI), SReg) + BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg) .addReg(SReg, RegState::Kill) .addImm(Offset); diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index f70a594..faf690f 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -57,6 +57,8 @@ public: int SPAdj, RegScavenger *RS) const; void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex, int SPAdj, RegScavenger *RS) const; + void lowerCRRestore(MachineBasicBlock::iterator II, unsigned FrameIndex, + int SPAdj, RegScavenger *RS) const; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, RegScavenger *RS = NULL) const; diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index 8acf75c..baa0eb5 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "PPCSubtarget.h" +#include "PPCRegisterInfo.h" #include "PPC.h" #include "llvm/GlobalValue.h" #include "llvm/Target/TargetMachine.h" @@ -140,3 +141,22 @@ bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV, return GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() || GV->hasCommonLinkage() || isDecl; } + +bool PPCSubtarget::enablePostRAScheduler( + CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const { + if (DarwinDirective == PPC::DIR_440) + return false; + + Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; + CriticalPathRCs.clear(); + + if (isPPC64()) + CriticalPathRCs.push_back(&PPC::G8RCRegClass); + else + CriticalPathRCs.push_back(&PPC::GPRCRegClass); + + return OptLevel >= CodeGenOpt::Default; +} + diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index d2b853d..62b2424 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -148,6 +148,10 @@ public: bool isDarwinABI() const { return isDarwin(); } bool isSVR4ABI() const { return !isDarwin(); } + /// enablePostRAScheduler - True at 'More' optimization. 
+ bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const; }; } // End llvm namespace diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index de8fca0..8e71c46 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -28,10 +28,11 @@ extern "C" void LLVMInitializePowerPCTarget() { PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64Bit) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), Subtarget(TT, CPU, FS, is64Bit), DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this), FrameLowering(Subtarget), JITInfo(*this, is64Bit), @@ -45,17 +46,19 @@ bool PPCTargetMachine::getEnableTailMergeDefault() const { return false; } PPC32TargetMachine::PPC32TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, RM, CM, OL, false) { + : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) { } PPC64TargetMachine::PPC64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, RM, CM, OL, true) { + : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) { } @@ -81,7 +84,7 @@ bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM, if (Subtarget.isPPC64()) // Temporary workaround for the inability of PPC64 JIT to handle jump // tables. - DisableJumpTables = true; + Options.DisableJumpTables = true; // Inform the subtarget that we are in JIT mode. FIXME: does this break macho // writing? 
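The constructor changes just above (and the matching Sparc and X86 changes later in the patch) thread a caller-supplied TargetOptions into LLVMTargetMachine in place of the file-scope cl::opt globals that TargetMachine.cpp stops defining further down. A rough caller-side sketch, assuming the createTargetMachine factory gained the same TargetOptions parameter as these constructors; the triple, CPU, and option settings are placeholders:

#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"

using namespace llvm;

// Build a PPC target machine with explicitly chosen codegen options rather
// than relying on global command-line flags.
static TargetMachine *createPPCTargetMachine(const Target &T) {
  TargetOptions Options;
  Options.GuaranteedTailCallOpt = true;  // previously the -tailcallopt global
  Options.NoZerosInBSS = false;          // previously -nozero-initialized-in-bss
  return T.createTargetMachine("powerpc64-unknown-linux-gnu", "ppc64", "",
                               Options, Reloc::Default, CodeModel::Default,
                               CodeGenOpt::Default);
}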
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 03b27c6..0427876 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -41,7 +41,7 @@ class PPCTargetMachine : public LLVMTargetMachine { public: PPCTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64Bit); @@ -79,7 +79,7 @@ public: class PPC32TargetMachine : public PPCTargetMachine { public: PPC32TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -89,7 +89,7 @@ public: class PPC64TargetMachine : public PPCTargetMachine { public: PPC64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; diff --git a/lib/Target/PowerPC/TargetInfo/CMakeLists.txt b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt index f63111f..fdb8a62 100644 --- a/lib/Target/PowerPC/TargetInfo/CMakeLists.txt +++ b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt @@ -4,10 +4,4 @@ add_llvm_library(LLVMPowerPCInfo PowerPCTargetInfo.cpp ) -add_llvm_library_dependencies(LLVMPowerPCInfo - LLVMMC - LLVMSupport - LLVMTarget - ) - add_dependencies(LLVMPowerPCInfo PowerPCCommonTableGen) diff --git a/lib/Target/PowerPC/TargetInfo/LLVMBuild.txt b/lib/Target/PowerPC/TargetInfo/LLVMBuild.txt index f51b417..f77d85b 100644 --- a/lib/Target/PowerPC/TargetInfo/LLVMBuild.txt +++ b/lib/Target/PowerPC/TargetInfo/LLVMBuild.txt @@ -21,4 +21,3 @@ name = PowerPCInfo parent = PowerPC required_libraries = MC Support Target add_to_library_groups = PowerPC - diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt index 9687951..56ee7c2 100644 --- a/lib/Target/Sparc/CMakeLists.txt +++ b/lib/Target/Sparc/CMakeLists.txt @@ -22,17 +22,5 @@ add_llvm_target(SparcCodeGen SparcSelectionDAGInfo.cpp ) -add_llvm_library_dependencies(LLVMSparcCodeGen - LLVMAsmPrinter - LLVMCodeGen - LLVMCore - LLVMMC - LLVMSelectionDAG - LLVMSparcDesc - LLVMSparcInfo - LLVMSupport - LLVMTarget - ) - add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index dab35e5..9295408 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -100,7 +100,7 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { bool Changed = false; for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) - if (I->getDesc().hasDelaySlot()) { + if (I->hasDelaySlot()) { MachineBasicBlock::iterator D = MBB.end(); MachineBasicBlock::iterator J = I; @@ -149,7 +149,7 @@ Filler::findDelayInstr(MachineBasicBlock &MBB, } //Call's delay filler can def some of call's uses. 
- if (slot->getDesc().isCall()) + if (slot->isCall()) insertCallUses(slot, RegUses); else insertDefsUses(slot, RegDefs, RegUses); @@ -170,7 +170,7 @@ Filler::findDelayInstr(MachineBasicBlock &MBB, if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isLabel() - || I->getDesc().hasDelaySlot() + || I->hasDelaySlot() || isDelayFiller(MBB, I)) break; @@ -194,13 +194,13 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate, if (candidate->isImplicitDef() || candidate->isKill()) return true; - if (candidate->getDesc().mayLoad()) { + if (candidate->mayLoad()) { sawLoad = true; if (sawStore) return true; } - if (candidate->getDesc().mayStore()) { + if (candidate->mayStore()) { if (sawStore) return true; sawStore = true; @@ -298,13 +298,13 @@ bool Filler::isDelayFiller(MachineBasicBlock &MBB, return false; if (candidate->getOpcode() == SP::UNIMP) return true; - const MCInstrDesc &prevdesc = (--candidate)->getDesc(); - return prevdesc.hasDelaySlot(); + --candidate; + return candidate->hasDelaySlot(); } bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize) { - if (!I->getDesc().isCall()) + if (!I->isCall()) return false; unsigned structSizeOpNum = 0; diff --git a/lib/Target/Sparc/LLVMBuild.txt b/lib/Target/Sparc/LLVMBuild.txt index 38c797f..fe20d2f 100644 --- a/lib/Target/Sparc/LLVMBuild.txt +++ b/lib/Target/Sparc/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = MCTargetDesc TargetInfo + [component_0] type = TargetGroup name = Sparc @@ -27,4 +30,3 @@ name = SparcCodeGen parent = Sparc required_libraries = AsmPrinter CodeGen Core MC SelectionDAG SparcDesc SparcInfo Support Target add_to_library_groups = Sparc - diff --git a/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt b/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt index d3bdf0b..9d4db4d 100644 --- a/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt @@ -3,10 +3,4 @@ add_llvm_library(LLVMSparcDesc SparcMCAsmInfo.cpp ) -add_llvm_library_dependencies(LLVMSparcDesc - LLVMMC - LLVMSparcInfo - LLVMSupport - ) - add_dependencies(LLVMSparcDesc SparcCommonTableGen) diff --git a/lib/Target/Sparc/MCTargetDesc/LLVMBuild.txt b/lib/Target/Sparc/MCTargetDesc/LLVMBuild.txt index a339cec..97f8f16 100644 --- a/lib/Target/Sparc/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/Sparc/MCTargetDesc/LLVMBuild.txt @@ -21,4 +21,3 @@ name = SparcDesc parent = Sparc required_libraries = MC SparcInfo Support add_to_library_groups = Sparc - diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp index deb39d9..7548bbf 100644 --- a/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -236,9 +236,9 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { // Check if the last terminator is an unconditional branch. 
MachineBasicBlock::const_iterator I = Pred->end(); - while (I != Pred->begin() && !(--I)->getDesc().isTerminator()) + while (I != Pred->begin() && !(--I)->isTerminator()) ; // Noop - return I == Pred->end() || !I->getDesc().isBarrier(); + return I == Pred->end() || !I->isBarrier(); } diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 25104d1..3608d3b 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -763,7 +763,9 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM) setOperationAction(ISD::FMA , MVT::f32, Expand); setOperationAction(ISD::CTPOP, MVT::i32, Expand); setOperationAction(ISD::CTTZ , MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); setOperationAction(ISD::CTLZ , MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); setOperationAction(ISD::ROTL , MVT::i32, Expand); setOperationAction(ISD::ROTR , MVT::i32, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Expand); diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index 7a6bf50..5290d42 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -133,7 +133,7 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, break; //Terminator is not a branch - if (!I->getDesc().isBranch()) + if (!I->isBranch()) return true; //Handle Unconditional branches @@ -195,7 +195,7 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, .addMBB(UnCondBrIter->getOperand(0).getMBB()).addImm(BranchCode); BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(SP::BA)) .addMBB(TargetBB); - MBB.addSuccessor(TargetBB); + OldInst->eraseFromParent(); UnCondBrIter->eraseFromParent(); diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 7dff799..8e16fd7 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -26,10 +26,11 @@ extern "C" void LLVMInitializeSparcTarget() { /// SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64bit) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), Subtarget(TT, CPU, FS, is64bit), DataLayout(Subtarget.getDataLayout()), TLInfo(*this), TSInfo(*this), InstrInfo(Subtarget), @@ -52,16 +53,20 @@ bool SparcTargetMachine::addPreEmitPass(PassManagerBase &PM){ SparcV8TargetMachine::SparcV8TargetMachine(const Target &T, StringRef TT, StringRef CPU, - StringRef FS, Reloc::Model RM, + StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : SparcTargetMachine(T, TT, CPU, FS, RM, CM, OL, false) { + : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) { } SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, StringRef TT, StringRef CPU, - StringRef FS, Reloc::Model RM, + StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : SparcTargetMachine(T, TT, CPU, FS, RM, CM, OL, true) { + : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) { } diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index 63bfa5d..cedc1e3 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -34,9 +34,9 @@ class 
SparcTargetMachine : public LLVMTargetMachine { SparcFrameLowering FrameLowering; public: SparcTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, bool is64bit); + CodeGenOpt::Level OL, bool is64bit); virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; } virtual const TargetFrameLowering *getFrameLowering() const { @@ -65,6 +65,7 @@ class SparcV8TargetMachine : public SparcTargetMachine { public: SparcV8TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -75,6 +76,7 @@ class SparcV9TargetMachine : public SparcTargetMachine { public: SparcV9TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; diff --git a/lib/Target/Sparc/TargetInfo/CMakeLists.txt b/lib/Target/Sparc/TargetInfo/CMakeLists.txt index a076023..b0d031e 100644 --- a/lib/Target/Sparc/TargetInfo/CMakeLists.txt +++ b/lib/Target/Sparc/TargetInfo/CMakeLists.txt @@ -4,10 +4,4 @@ add_llvm_library(LLVMSparcInfo SparcTargetInfo.cpp ) -add_llvm_library_dependencies(LLVMSparcInfo - LLVMMC - LLVMSupport - LLVMTarget - ) - add_dependencies(LLVMSparcInfo SparcCommonTableGen) diff --git a/lib/Target/Sparc/TargetInfo/LLVMBuild.txt b/lib/Target/Sparc/TargetInfo/LLVMBuild.txt index 81c9032..b5c320f 100644 --- a/lib/Target/Sparc/TargetInfo/LLVMBuild.txt +++ b/lib/Target/Sparc/TargetInfo/LLVMBuild.txt @@ -21,4 +21,3 @@ name = SparcInfo parent = Sparc required_libraries = MC Support Target add_to_library_groups = Sparc - diff --git a/lib/Target/TargetFrameLowering.cpp b/lib/Target/TargetFrameLowering.cpp deleted file mode 100644 index 122f869..0000000 --- a/lib/Target/TargetFrameLowering.cpp +++ /dev/null @@ -1,45 +0,0 @@ -//===----- TargetFrameLowering.cpp - Implement target frame interface ------==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Implements the layout of a stack frame on the target machine. -// -//===----------------------------------------------------------------------===// - -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" - -#include <cstdlib> -using namespace llvm; - -TargetFrameLowering::~TargetFrameLowering() { -} - -/// getFrameIndexOffset - Returns the displacement from the frame register to -/// the stack frame of the specified index. This is the default implementation -/// which is overridden for some targets. -int TargetFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return MFI->getObjectOffset(FI) + MFI->getStackSize() - - getOffsetOfLocalArea() + MFI->getOffsetAdjustment(); -} - -int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF, - int FI, unsigned &FrameReg) const { - const TargetRegisterInfo *RI = MF.getTarget().getRegisterInfo(); - - // By default, assume all frame indices are referenced via whatever - // getFrameRegister() says. 
The target can override this if it's doing - // something different. - FrameReg = RI->getFrameRegister(MF); - return getFrameIndexOffset(MF, FI); -} diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp index d52ecb3..440f9ad 100644 --- a/lib/Target/TargetInstrInfo.cpp +++ b/lib/Target/TargetInstrInfo.cpp @@ -13,7 +13,6 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/ErrorHandling.h" @@ -73,23 +72,6 @@ TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); } -int -TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, - SDNode *DefNode, unsigned DefIdx, - SDNode *UseNode, unsigned UseIdx) const { - if (!ItinData || ItinData->isEmpty()) - return -1; - - if (!DefNode->isMachineOpcode()) - return -1; - - unsigned DefClass = get(DefNode->getMachineOpcode()).getSchedClass(); - if (!UseNode->isMachineOpcode()) - return ItinData->getOperandCycle(DefClass, DefIdx); - unsigned UseClass = get(UseNode->getMachineOpcode()).getSchedClass(); - return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); -} - int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr *MI, unsigned *PredCost) const { @@ -99,17 +81,6 @@ int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return ItinData->getStageLatency(MI->getDesc().getSchedClass()); } -int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - SDNode *N) const { - if (!ItinData || ItinData->isEmpty()) - return 1; - - if (!N->isMachineOpcode()) - return 1; - - return ItinData->getStageLatency(get(N->getMachineOpcode()).getSchedClass()); -} - bool TargetInstrInfo::hasLowDefLatency(const InstrItineraryData *ItinData, const MachineInstr *DefMI, unsigned DefIdx) const { @@ -129,19 +100,6 @@ void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB, } -bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { - const MCInstrDesc &MCID = MI->getDesc(); - if (!MCID.isTerminator()) return false; - - // Conditional branch is a special case. - if (MCID.isBranch() && !MCID.isBarrier()) - return true; - if (!MCID.isPredicable()) - return true; - return !isPredicated(MI); -} - - /// Measure the specified inline asm to determine an approximation of its /// length. 
/// Comments (which run till the next SeparatorString or newline) do not diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp index aa2e014..768facb 100644 --- a/lib/Target/TargetLibraryInfo.cpp +++ b/lib/Target/TargetLibraryInfo.cpp @@ -22,15 +22,96 @@ char TargetLibraryInfo::ID = 0; const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = { - "memset", + "acos", + "acosl", + "acosf", + "asin", + "asinl", + "asinf", + "atan", + "atanl", + "atanf", + "atan2", + "atan2l", + "atan2f", + "ceil", + "ceill", + "ceilf", + "copysign", + "copysignf", + "copysignl", + "cos", + "cosl", + "cosf", + "cosh", + "coshl", + "coshf", + "exp", + "expl", + "expf", + "exp2", + "exp2l", + "exp2f", + "expm1", + "expm1l", + "expm1f", + "fabs", + "fabsl", + "fabsf", + "floor", + "floorl", + "floorf", + "fiprintf", + "fmod", + "fmodl", + "fmodf", + "fputs", + "fwrite", + "iprintf", + "log", + "logl", + "logf", + "log2", + "log2l", + "log2f", + "log10", + "log10l", + "log10f", + "log1p", + "log1pl", + "log1pf", "memcpy", "memmove", + "memset", "memset_pattern16", - "iprintf", + "nearbyint", + "nearbyintf", + "nearbyintl", + "pow", + "powf", + "powl", + "rint", + "rintf", + "rintl", + "sin", + "sinl", + "sinf", + "sinh", + "sinhl", + "sinhf", "siprintf", - "fiprintf", - "fwrite", - "fputs" + "sqrt", + "sqrtl", + "sqrtf", + "tan", + "tanl", + "tanf", + "tanh", + "tanhl", + "tanhf", + "trunc", + "truncf", + "truncl" }; /// initialize - Initialize the set of available library functions based on the diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index 56b7b69..fc8b67b 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -48,7 +48,7 @@ void TargetLoweringObjectFile::Initialize(MCContext &ctx, TargetLoweringObjectFile::~TargetLoweringObjectFile() { } -static bool isSuitableForBSS(const GlobalVariable *GV) { +static bool isSuitableForBSS(const GlobalVariable *GV, bool NoZerosInBSS) { const Constant *C = GV->getInitializer(); // Must have zero initializer. @@ -133,7 +133,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // Handle thread-local data first. if (GVar->isThreadLocal()) { - if (isSuitableForBSS(GVar)) + if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS)) return SectionKind::getThreadBSS(); return SectionKind::getThreadData(); } @@ -143,7 +143,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // Variable can be easily put to BSS section.
- if (isSuitableForBSS(GVar)) { + if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS)) { if (GVar->hasLocalLinkage()) return SectionKind::getBSSLocal(); else if (GVar->hasExternalLinkage()) diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 805e16e..fb7bbbb 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -24,153 +22,11 @@ using namespace llvm; // namespace llvm { - bool LessPreciseFPMADOption; - bool PrintMachineCode; - bool NoFramePointerElim; - bool NoFramePointerElimNonLeaf; - bool NoExcessFPPrecision; - bool UnsafeFPMath; - bool NoInfsFPMath; - bool NoNaNsFPMath; - bool HonorSignDependentRoundingFPMathOption; - bool UseSoftFloat; - FloatABI::ABIType FloatABIType; - bool NoImplicitFloat; - bool NoZerosInBSS; - bool JITExceptionHandling; - bool JITEmitDebugInfo; - bool JITEmitDebugInfoToDisk; - bool GuaranteedTailCallOpt; - unsigned StackAlignmentOverride; - bool RealignStack; - bool DisableJumpTables; bool StrongPHIElim; bool HasDivModLibcall; bool AsmVerbosityDefault(false); - bool EnableSegmentedStacks; } -static cl::opt<bool, true> -PrintCode("print-machineinstrs", - cl::desc("Print generated machine code"), - cl::location(PrintMachineCode), cl::init(false)); -static cl::opt<bool, true> -DisableFPElim("disable-fp-elim", - cl::desc("Disable frame pointer elimination optimization"), - cl::location(NoFramePointerElim), - cl::init(false)); -static cl::opt<bool, true> -DisableFPElimNonLeaf("disable-non-leaf-fp-elim", - cl::desc("Disable frame pointer elimination optimization for non-leaf funcs"), - cl::location(NoFramePointerElimNonLeaf), - cl::init(false)); -static cl::opt<bool, true> -DisableExcessPrecision("disable-excess-fp-precision", - cl::desc("Disable optimizations that may increase FP precision"), - cl::location(NoExcessFPPrecision), - cl::init(false)); -static cl::opt<bool, true> -EnableFPMAD("enable-fp-mad", - cl::desc("Enable less precise MAD instructions to be generated"), - cl::location(LessPreciseFPMADOption), - cl::init(false)); -static cl::opt<bool, true> -EnableUnsafeFPMath("enable-unsafe-fp-math", - cl::desc("Enable optimizations that may decrease FP precision"), - cl::location(UnsafeFPMath), - cl::init(false)); -static cl::opt<bool, true> -EnableNoInfsFPMath("enable-no-infs-fp-math", - cl::desc("Enable FP math optimizations that assume no +-Infs"), - cl::location(NoInfsFPMath), - cl::init(false)); -static cl::opt<bool, true> -EnableNoNaNsFPMath("enable-no-nans-fp-math", - cl::desc("Enable FP math optimizations that assume no NaNs"), - cl::location(NoNaNsFPMath), - cl::init(false)); -static cl::opt<bool, true> -EnableHonorSignDependentRoundingFPMath("enable-sign-dependent-rounding-fp-math", - cl::Hidden, - cl::desc("Force codegen to assume rounding mode can change dynamically"), - cl::location(HonorSignDependentRoundingFPMathOption), - cl::init(false)); -static cl::opt<bool, true> -GenerateSoftFloatCalls("soft-float", - cl::desc("Generate software floating point library calls"), - cl::location(UseSoftFloat), - cl::init(false)); -static cl::opt<llvm::FloatABI::ABIType, true> -FloatABIForCalls("float-abi", - cl::desc("Choose float ABI type"), - cl::location(FloatABIType), - cl::init(FloatABI::Default), - cl::values( - 
clEnumValN(FloatABI::Default, "default", - "Target default float ABI type"), - clEnumValN(FloatABI::Soft, "soft", - "Soft float ABI (implied by -soft-float)"), - clEnumValN(FloatABI::Hard, "hard", - "Hard float ABI (uses FP registers)"), - clEnumValEnd)); -static cl::opt<bool, true> -DontPlaceZerosInBSS("nozero-initialized-in-bss", - cl::desc("Don't place zero-initialized symbols into bss section"), - cl::location(NoZerosInBSS), - cl::init(false)); -static cl::opt<bool, true> -EnableJITExceptionHandling("jit-enable-eh", - cl::desc("Emit exception handling information"), - cl::location(JITExceptionHandling), - cl::init(false)); -// In debug builds, make this default to true. -#ifdef NDEBUG -#define EMIT_DEBUG false -#else -#define EMIT_DEBUG true -#endif -static cl::opt<bool, true> -EmitJitDebugInfo("jit-emit-debug", - cl::desc("Emit debug information to debugger"), - cl::location(JITEmitDebugInfo), - cl::init(EMIT_DEBUG)); -#undef EMIT_DEBUG -static cl::opt<bool, true> -EmitJitDebugInfoToDisk("jit-emit-debug-to-disk", - cl::Hidden, - cl::desc("Emit debug info objfiles to disk"), - cl::location(JITEmitDebugInfoToDisk), - cl::init(false)); - -static cl::opt<bool, true> -EnableGuaranteedTailCallOpt("tailcallopt", - cl::desc("Turn fastcc calls into tail calls by (potentially) changing ABI."), - cl::location(GuaranteedTailCallOpt), - cl::init(false)); -static cl::opt<unsigned, true> -OverrideStackAlignment("stack-alignment", - cl::desc("Override default stack alignment"), - cl::location(StackAlignmentOverride), - cl::init(0)); -static cl::opt<bool, true> -EnableRealignStack("realign-stack", - cl::desc("Realign stack if needed"), - cl::location(RealignStack), - cl::init(true)); -static cl::opt<bool, true> -DisableSwitchTables(cl::Hidden, "disable-jump-tables", - cl::desc("Do not generate jump tables."), - cl::location(DisableJumpTables), - cl::init(false)); -static cl::opt<bool, true> -EnableStrongPHIElim(cl::Hidden, "strong-phi-elim", - cl::desc("Use strong PHI elimination."), - cl::location(StrongPHIElim), - cl::init(false)); -static cl::opt<std::string> -TrapFuncName("trap-func", cl::Hidden, - cl::desc("Emit a call to trap function rather than a trap instruction"), - cl::init("")); static cl::opt<bool> DataSections("fdata-sections", cl::desc("Emit data into separate sections"), @@ -179,18 +35,14 @@ static cl::opt<bool> FunctionSections("ffunction-sections", cl::desc("Emit functions into separate sections"), cl::init(false)); -static cl::opt<bool, true> -SegmentedStacks("segmented-stacks", - cl::desc("Use segmented stacks if possible."), - cl::location(EnableSegmentedStacks), - cl::init(false)); //--------------------------------------------------------------------------- // TargetMachine Class // TargetMachine::TargetMachine(const Target &T, - StringRef TT, StringRef CPU, StringRef FS) + StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options) : TheTarget(T), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), CodeGenInfo(0), AsmInfo(0), MCRelaxAll(false), @@ -198,11 +50,8 @@ TargetMachine::TargetMachine(const Target &T, MCSaveTempLabels(false), MCUseLoc(true), MCUseCFI(true), - MCUseDwarfDirectory(false) { - // Typically it will be subtargets that will adjust FloatABIType from Default - // to Soft or Hard. 
- if (UseSoftFloat) - FloatABIType = FloatABI::Soft; + MCUseDwarfDirectory(false), + Options(Options) { } TargetMachine::~TargetMachine() { @@ -258,36 +107,3 @@ void TargetMachine::setDataSections(bool V) { DataSections = V; } -namespace llvm { - /// DisableFramePointerElim - This returns true if frame pointer elimination - /// optimization should be disabled for the given machine function. - bool DisableFramePointerElim(const MachineFunction &MF) { - // Check to see if we should eliminate non-leaf frame pointers and then - // check to see if we should eliminate all frame pointers. - if (NoFramePointerElimNonLeaf && !NoFramePointerElim) { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return MFI->hasCalls(); - } - - return NoFramePointerElim; - } - - /// LessPreciseFPMAD - This flag return true when -enable-fp-mad option - /// is specified on the command line. When this flag is off(default), the - /// code generator is not allowed to generate mad (multiply add) if the - /// result is "less precise" than doing those operations individually. - bool LessPreciseFPMAD() { return UnsafeFPMath || LessPreciseFPMADOption; } - - /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume - /// that the rounding mode of the FPU can change from its default. - bool HonorSignDependentRoundingFPMath() { - return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption; - } - - /// getTrapFunctionName - If this returns a non-empty string, this means isel - /// should lower Intrinsic::trap to a call to the specified function name - /// instead of an ISD::TRAP node. - StringRef getTrapFunctionName() { - return TrapFuncName; - } -} diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp index 67239b8..2689837 100644 --- a/lib/Target/TargetRegisterInfo.cpp +++ b/lib/Target/TargetRegisterInfo.cpp @@ -13,8 +13,6 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/Support/raw_ostream.h" diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt index 94aca7a..47489bb 100644 --- a/lib/Target/X86/AsmParser/CMakeLists.txt +++ b/lib/Target/X86/AsmParser/CMakeLists.txt @@ -5,12 +5,4 @@ add_llvm_library(LLVMX86AsmParser X86AsmParser.cpp ) -add_llvm_library_dependencies(LLVMX86AsmParser - LLVMMC - LLVMMCParser - LLVMSupport - LLVMX86Desc - LLVMX86Info - ) - add_dependencies(LLVMX86AsmParser X86CommonTableGen) diff --git a/lib/Target/X86/AsmParser/LLVMBuild.txt b/lib/Target/X86/AsmParser/LLVMBuild.txt index 6c2405a..9f94d5d 100644 --- a/lib/Target/X86/AsmParser/LLVMBuild.txt +++ b/lib/Target/X86/AsmParser/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86AsmParser parent = X86 required_libraries = MC MCParser Support X86Desc X86Info add_to_library_groups = X86 - diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 4542d4b..be15899 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -51,21 +51,6 @@ endif() add_llvm_target(X86CodeGen ${sources}) -add_llvm_library_dependencies(LLVMX86CodeGen - LLVMAnalysis - LLVMAsmPrinter - LLVMCodeGen - LLVMCore - LLVMMC - LLVMSelectionDAG - LLVMSupport - LLVMTarget - LLVMX86AsmPrinter - LLVMX86Desc - LLVMX86Info - LLVMX86Utils - ) - add_subdirectory(AsmParser) add_subdirectory(Disassembler) add_subdirectory(InstPrinter) diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt 
b/lib/Target/X86/Disassembler/CMakeLists.txt index 4f570d5..0cd6db9 100644 --- a/lib/Target/X86/Disassembler/CMakeLists.txt +++ b/lib/Target/X86/Disassembler/CMakeLists.txt @@ -5,12 +5,6 @@ add_llvm_library(LLVMX86Disassembler X86DisassemblerDecoder.c ) -add_llvm_library_dependencies(LLVMX86Disassembler - LLVMMC - LLVMSupport - LLVMX86Info - ) - # workaround for hanging compilation on MSVC9 and 10 if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) set_property( diff --git a/lib/Target/X86/Disassembler/LLVMBuild.txt b/lib/Target/X86/Disassembler/LLVMBuild.txt index cd748cf..cac7adf 100644 --- a/lib/Target/X86/Disassembler/LLVMBuild.txt +++ b/lib/Target/X86/Disassembler/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86Disassembler parent = X86 required_libraries = MC Support X86Info add_to_library_groups = X86 - diff --git a/lib/Target/X86/InstPrinter/CMakeLists.txt b/lib/Target/X86/InstPrinter/CMakeLists.txt index 2a2b5db..28e2460 100644 --- a/lib/Target/X86/InstPrinter/CMakeLists.txt +++ b/lib/Target/X86/InstPrinter/CMakeLists.txt @@ -6,10 +6,4 @@ add_llvm_library(LLVMX86AsmPrinter X86InstComments.cpp ) -add_llvm_library_dependencies(LLVMX86AsmPrinter - LLVMMC - LLVMSupport - LLVMX86Utils - ) - add_dependencies(LLVMX86AsmPrinter X86CommonTableGen) diff --git a/lib/Target/X86/InstPrinter/LLVMBuild.txt b/lib/Target/X86/InstPrinter/LLVMBuild.txt index fb01323..6868dde 100644 --- a/lib/Target/X86/InstPrinter/LLVMBuild.txt +++ b/lib/Target/X86/InstPrinter/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86AsmPrinter parent = X86 required_libraries = MC Support X86Utils add_to_library_groups = X86 - diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 6e87efa..6e4b1b9 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -106,28 +106,92 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::PUNPCKHBWrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(16, ShuffleMask); + DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKHBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHBWrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKHBWYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHBWYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v32i8, ShuffleMask); break; case X86::PUNPCKHWDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKHWDrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(8, ShuffleMask); + DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKHWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHWDrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKHWDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VPUNPCKHWDYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v16i16, ShuffleMask); break; case X86::PUNPCKHDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKHDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(4, ShuffleMask); + DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKHDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKHDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v8i32, ShuffleMask); break; case X86::PUNPCKHQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKHQDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(2, ShuffleMask); + DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKHQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHQDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKHQDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHQDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v4i64, ShuffleMask); break; case X86::PUNPCKLBWrr: @@ -135,42 +199,117 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::PUNPCKLBWrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLBWMask(16, ShuffleMask); + DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKLBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLBWrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKLBWYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLBWYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v32i8, ShuffleMask); break; case X86::PUNPCKLWDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLWDrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLWDMask(8, ShuffleMask); + DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKLWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLWDrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKLWDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VPUNPCKLWDYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v16i16, ShuffleMask); break; case X86::PUNPCKLDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLDQMask(4, ShuffleMask); + DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKLDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKLDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v8i32, ShuffleMask); break; case X86::PUNPCKLQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLQDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLQDQMask(2, ShuffleMask); + DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKLQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLQDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKLQDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLQDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v4i64, ShuffleMask); break; case X86::SHUFPDrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::SHUFPDrmi: - DecodeSHUFPSMask(2, MI->getOperand(3).getImm(), ShuffleMask); + DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VSHUFPDrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VSHUFPDrmi: - DecodeSHUFPSMask(2, MI->getOperand(3).getImm(), ShuffleMask); + DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VSHUFPDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VSHUFPDYrmi: + DecodeSHUFPMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -179,14 +318,25 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::SHUFPSrmi: - DecodeSHUFPSMask(4, MI->getOperand(3).getImm(), ShuffleMask); + DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VSHUFPSrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
case X86::VSHUFPSrmi: - DecodeSHUFPSMask(4, MI->getOperand(3).getImm(), ShuffleMask); + DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VSHUFPSYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VSHUFPSYrmi: + DecodeSHUFPMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -195,14 +345,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPDrm: - DecodeUNPCKLPMask(MVT::v2f64, ShuffleMask); + DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPDrm: - DecodeUNPCKLPMask(MVT::v2f64, ShuffleMask); + DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -210,7 +360,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPDYrm: - DecodeUNPCKLPMask(MVT::v4f64, ShuffleMask); + DecodeUNPCKLMask(MVT::v4f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -218,14 +368,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPSrm: - DecodeUNPCKLPMask(MVT::v4f32, ShuffleMask); + DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPSrm: - DecodeUNPCKLPMask(MVT::v4f32, ShuffleMask); + DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -233,7 +383,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPSYrm: - DecodeUNPCKLPMask(MVT::v8f32, ShuffleMask); + DecodeUNPCKLMask(MVT::v8f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -241,14 +391,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKHPDrm: - DecodeUNPCKHPMask(MVT::v2f64, ShuffleMask); + DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKHPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKHPDrm: - DecodeUNPCKHPMask(MVT::v2f64, ShuffleMask); + DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -256,7 +406,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
case X86::VUNPCKHPDYrm: - DecodeUNPCKLPMask(MVT::v4f64, ShuffleMask); + DecodeUNPCKHMask(MVT::v4f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -264,14 +414,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKHPSrm: - DecodeUNPCKHPMask(MVT::v4f32, ShuffleMask); + DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKHPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKHPSrm: - DecodeUNPCKHPMask(MVT::v4f32, ShuffleMask); + DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -279,34 +429,52 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKHPSYrm: - DecodeUNPCKHPMask(MVT::v8f32, ShuffleMask); + DecodeUNPCKHMask(MVT::v8f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERMILPSri: - DecodeVPERMILPSMask(4, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPSmi: + DecodeVPERMILPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERMILPSYri: - DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPSYmi: + DecodeVPERMILPMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERMILPDri: - DecodeVPERMILPDMask(2, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPDmi: + DecodeVPERMILPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERMILPDYri: - DecodeVPERMILPDMask(4, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPDYmi: + DecodeVPERMILPMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERM2F128rr: - DecodeVPERM2F128Mask(MI->getOperand(3).getImm(), ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); + case X86::VPERM2I128rr: Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VPERM2F128rm: + case X86::VPERM2I128rm: + DecodeVPERM2F128Mask(MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); break; } diff --git a/lib/Target/X86/LLVMBuild.txt b/lib/Target/X86/LLVMBuild.txt index 514566c..87305e0 100644 --- a/lib/Target/X86/LLVMBuild.txt +++ b/lib/Target/X86/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils + [component_0] type = TargetGroup name = X86 @@ -30,4 +33,3 @@ name = X86CodeGen parent = X86 required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target X86AsmPrinter X86Desc X86Info X86Utils add_to_library_groups = X86 - diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt index 264e791..ab2ebb4 100644 --- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt @@ -6,13 +6,6 @@ add_llvm_library(LLVMX86Desc X86MachObjectWriter.cpp ) -add_llvm_library_dependencies(LLVMX86Desc - LLVMMC - LLVMSupport - LLVMX86AsmPrinter - LLVMX86Info - ) - add_dependencies(LLVMX86Desc X86CommonTableGen) # Hack: we need to include 'main' target directory to grab private headers diff --git a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt index 3d09301..9e1d29c 100644 --- a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86Desc parent = X86 required_libraries = MC Support X86AsmPrinter X86Info add_to_library_groups = X86 - diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 69ad7d7..87b2b05 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -107,6 +107,11 @@ public: bool MayNeedRelaxation(const MCInst &Inst) const; + bool fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const; + void RelaxInstruction(const MCInst &Inst, MCInst &Res) const; bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const; @@ -244,6 +249,14 @@ bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const { return hasExp && !hasRIP; } +bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const { + // Relax if the value is too big for a (signed) i8. + return int64_t(Value) != int64_t(int8_t(Value)); +} + // FIXME: Can tblgen help at all here to verify there aren't other instructions // we can relax? void X86AsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const { diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index c50f785..662ac1d 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -304,6 +304,12 @@ namespace X86II { // TAXD - Prefix before and after 0x0F. Combination of TA and XD. TAXD = 19 << Op0Shift, + // XOP8 - Prefix to include use of imm byte. + XOP8 = 20 << Op0Shift, + + // XOP9 - Prefix to exclude use of imm byte. + XOP9 = 21 << Op0Shift, + //===------------------------------------------------------------------===// // REX_W - REX prefixes are instruction prefixes used in 64-bit mode. 
// They are used to specify GPRs and SSE registers, 64-bit operand size, @@ -418,7 +424,16 @@ namespace X86II { /// storing a classifier in the imm8 field. To simplify our implementation, /// we handle this by storeing the classifier in the opcode field and using /// this flag to indicate that the encoder should do the wacky 3DNow! thing. - Has3DNow0F0FOpcode = 1U << 7 + Has3DNow0F0FOpcode = 1U << 7, + + /// XOP_W - Same bit as VEX_W. Used to indicate swapping of + /// operand 3 and 4 to be encoded in ModRM or I8IMM. This is used + /// for FMA4 and XOP instructions. + XOP_W = 1U << 8, + + /// XOP - Opcode prefix used by XOP instructions. + XOP = 1U << 9 + }; // getBaseOpcodeFor - This function returns the "base" X86 opcode for the @@ -488,9 +503,12 @@ namespace X86II { return 0; case X86II::MRMSrcMem: { bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasXOP_W = (TSFlags >> X86II::VEXShift) & X86II::XOP_W; unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). + if (HasXOP_W) + ++FirstMemOp;// Skip the register source (which is encoded in I8IMM). // FIXME: Maybe lea should have its own form? This is a horrible hack. //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 2703100..eb64ad1 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -125,7 +125,19 @@ getNonexecutableStackSection(MCContext &Ctx) const { 0, SectionKind::getMetadata()); } -X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) { +X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { + if (Triple.getArch() == Triple::x86_64) { + GlobalPrefix = ""; + PrivateGlobalPrefix = ".L"; + } + + AsmTransCBE = x86_asm_table; + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; +} + +X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { if (Triple.getArch() == Triple::x86_64) { GlobalPrefix = ""; PrivateGlobalPrefix = ".L"; diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index 2cd4c8e..5d619e8 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -38,8 +38,12 @@ namespace llvm { virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const; }; - struct X86MCAsmInfoCOFF : public MCAsmInfoCOFF { - explicit X86MCAsmInfoCOFF(const Triple &Triple); + struct X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + explicit X86MCAsmInfoMicrosoft(const Triple &Triple); + }; + + struct X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { + explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); }; } // namespace llvm diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 1ab469c..8e14cb1 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -169,23 +169,36 @@ static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) { return false; } -/// StartsWithGlobalOffsetTable - Return true for the simple cases where this -/// expression starts with _GLOBAL_OFFSET_TABLE_. This is a needed to support -/// PIC on ELF i386 as that symbol is magic. 
We check only simple case that +/// StartsWithGlobalOffsetTable - Check if this expression starts with +/// _GLOBAL_OFFSET_TABLE_ and if it is of the form +/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF +/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that /// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start /// of a binary expression. -static bool StartsWithGlobalOffsetTable(const MCExpr *Expr) { +enum GlobalOffsetTableExprKind { + GOT_None, + GOT_Normal, + GOT_SymDiff +}; +static GlobalOffsetTableExprKind +StartsWithGlobalOffsetTable(const MCExpr *Expr) { + const MCExpr *RHS = 0; if (Expr->getKind() == MCExpr::Binary) { const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr); Expr = BE->getLHS(); + RHS = BE->getRHS(); } if (Expr->getKind() != MCExpr::SymbolRef) - return false; + return GOT_None; const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); const MCSymbol &S = Ref->getSymbol(); - return S.getName() == "_GLOBAL_OFFSET_TABLE_"; + if (S.getName() != "_GLOBAL_OFFSET_TABLE_") + return GOT_None; + if (RHS && RHS->getKind() == MCExpr::SymbolRef) + return GOT_SymDiff; + return GOT_Normal; } void X86MCCodeEmitter:: @@ -209,12 +222,15 @@ EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind, // If we have an immoffset, add it to the expression. if ((FixupKind == FK_Data_4 || - FixupKind == MCFixupKind(X86::reloc_signed_4byte)) && - StartsWithGlobalOffsetTable(Expr)) { - assert(ImmOffset == 0); - - FixupKind = MCFixupKind(X86::reloc_global_offset_table); - ImmOffset = CurByte; + FixupKind == MCFixupKind(X86::reloc_signed_4byte))) { + GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr); + if (Kind != GOT_None) { + assert(ImmOffset == 0); + + FixupKind = MCFixupKind(X86::reloc_global_offset_table); + if (Kind == GOT_Normal) + ImmOffset = CurByte; + } } // If the fixup is pc-relative, we need to bias the value to be relative to @@ -415,6 +431,13 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // opcode extension, or ignored, depending on the opcode byte) unsigned char VEX_W = 0; + // XOP_W: opcode specific, same bit as VEX_W, but used to + // swap operand 3 and 4 for FMA4 and XOP instructions + unsigned char XOP_W = 0; + + // XOP: Use XOP prefix byte 0x8f instead of VEX. 
+ unsigned char XOP = 0; + // VEX_5M (VEX m-mmmmm field): // // 0b00000: Reserved for future use @@ -422,7 +445,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 0b00010: implied 0F 38 leading opcode bytes // 0b00011: implied 0F 3A leading opcode bytes // 0b00100-0b11111: Reserved for future use - // + // 0b01000: XOP map select - 08h instructions with imm byte + // 0b10001: XOP map select - 09h instructions with no imm byte unsigned char VEX_5M = 0x1; // VEX_4V (VEX vvvv field): a register specifier @@ -453,6 +477,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) VEX_W = 1; + if ((TSFlags >> X86II::VEXShift) & X86II::XOP_W) + XOP_W = 1; + + if ((TSFlags >> X86II::VEXShift) & X86II::XOP) + XOP = 1; + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) VEX_L = 1; @@ -482,6 +512,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, case X86II::XD: // F2 0F VEX_PP = 0x3; break; + case X86II::XOP8: + VEX_5M = 0x8; + break; + case X86II::XOP9: + VEX_5M = 0x9; + break; case X86II::A6: // Bypass: Not used by VEX case X86II::A7: // Bypass: Not used by VEX case X86II::TB: // Bypass: Not used by VEX @@ -489,6 +525,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, break; // No prefix! } + // Set the vector length to 256-bit if YMM0-YMM15 is used for (unsigned i = 0; i != MI.getNumOperands(); ++i) { if (!MI.getOperand(i).isReg()) @@ -529,6 +566,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // src1(ModR/M), MemAddr, imm8 // src1(ModR/M), MemAddr, src2(VEX_I8IMM) // + // FMA4: + // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) VEX_R = 0x0; @@ -620,16 +660,16 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); - if (VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { // 2 byte VEX prefix + if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix EmitByte(0xC5, CurByte, OS); EmitByte(LastByte | (VEX_R << 7), CurByte, OS); return; } // 3 byte VEX prefix - EmitByte(0xC4, CurByte, OS); + EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS); EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); - EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + EmitByte(LastByte | ((VEX_W | XOP_W) << 7), CurByte, OS); } /// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64 @@ -889,6 +929,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // It uses the VEX.VVVV field? bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; + bool HasXOP_W = (TSFlags >> X86II::VEXShift) & X86II::XOP_W; + unsigned XOP_W_I8IMMOperand = 2; // Determine where the memory operand starts, if present. 
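An aside on the prefix-emission hunk above: once XOP is in the picture, the escape byte and the W bit are chosen roughly as sketched below. This is a minimal, self-contained illustration in plain C++, not the LLVM emitter; the helper name emitPrefixSketch and the sample values in main are invented, and the inverted polarity of the real R/X/B bits is glossed over.

// Standalone sketch of the 2-byte VEX / 3-byte VEX / XOP escape selection in
// the hunk above; illustrative only, not the LLVM encoder.
#include <cstdint>
#include <cstdio>
#include <vector>

static void emitPrefixSketch(bool VEX_R, bool VEX_X, bool VEX_B, bool VEX_W,
                             bool XOP, bool XOP_W, uint8_t VEX_5M,
                             uint8_t LastByte, std::vector<uint8_t> &Out) {
  // The compact 2-byte form (C5) is only usable for plain VEX with the
  // default 0F opcode map and no W override; XOP always takes the long form.
  if (VEX_B && VEX_X && !VEX_W && !XOP && VEX_5M == 1) {
    Out.push_back(0xC5);
    Out.push_back(LastByte | (VEX_R << 7));
    return;
  }
  // 3-byte form: escape byte 0x8F for XOP, 0xC4 for VEX, then the map byte,
  // then the byte carrying W (or XOP_W), vvvv, L and pp.
  Out.push_back(XOP ? 0x8F : 0xC4);
  Out.push_back((VEX_R << 7) | (VEX_X << 6) | (VEX_B << 5) | VEX_5M);
  Out.push_back(LastByte | ((VEX_W | XOP_W) << 7));
}

int main() {
  std::vector<uint8_t> Bytes;
  // An XOP map-9 instruction is forced into the 3-byte form with escape 0x8F.
  emitPrefixSketch(true, true, true, false, /*XOP=*/true, /*XOP_W=*/false,
                   /*VEX_5M=*/0x9, /*LastByte=*/0x48, Bytes);
  for (uint8_t B : Bytes)
    std::printf("%02x ", B);
  std::printf("\n");
  return 0;
}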
int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); @@ -961,9 +1003,14 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) SrcRegNum++; + if(HasXOP_W) // Skip 2nd src (which is encoded in I8IMM) + SrcRegNum++; + EmitRegModRMByte(MI.getOperand(SrcRegNum), GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); - CurOp = SrcRegNum + 1; + + // 2 operands skipped with HasXOP_W, comensate accordingly + CurOp = HasXOP_W ? SrcRegNum : SrcRegNum + 1; if (HasVEX_4VOp3) ++CurOp; break; @@ -975,6 +1022,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, ++AddrOperands; ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). } + if(HasXOP_W) // Skip second register source (encoded in I8IMM) + ++FirstMemOp; EmitByte(BaseOpcode, CurByte, OS); @@ -1062,12 +1111,24 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // according to the right size for the instruction. if (CurOp != NumOps) { // The last source register of a 4 operand instruction in AVX is encoded - // in bits[7:4] of a immediate byte, and bits[3:0] are ignored. + // in bits[7:4] of a immediate byte. if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { - const MCOperand &MO = MI.getOperand(CurOp++); + const MCOperand &MO = MI.getOperand(HasXOP_W ? XOP_W_I8IMMOperand + : CurOp); + CurOp++; bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg()); unsigned RegNum = (IsExtReg ? (1 << 7) : 0); RegNum |= GetX86RegNum(MO) << 4; + // If there is an additional 5th operand it must be an immediate, which + // is encoded in bits[3:0] + if(CurOp != NumOps) { + const MCOperand &MIMM = MI.getOperand(CurOp++); + if(MIMM.isImm()) { + unsigned Val = MIMM.getImm(); + assert(Val < 16 && "Immediate operand value out of range"); + RegNum |= Val; + } + } EmitImmediate(MCOperand::CreateImm(RegNum), 1, FK_Data_1, CurByte, OS, Fixups); } else { diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index a843515..f2a34ed 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -361,8 +361,10 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) { MAI = new X86_64MCAsmInfoDarwin(TheTriple); else MAI = new X86MCAsmInfoDarwin(TheTriple); - } else if (TheTriple.isOSWindows()) { - MAI = new X86MCAsmInfoCOFF(TheTriple); + } else if (TheTriple.getOS() == Triple::Win32) { + MAI = new X86MCAsmInfoMicrosoft(TheTriple); + } else if (TheTriple.getOS() == Triple::MinGW32 || TheTriple.getOS() == Triple::Cygwin) { + MAI = new X86MCAsmInfoGNUCOFF(TheTriple); } else { MAI = new X86ELFMCAsmInfo(TheTriple); } diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 7d901af..a581993 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -922,16 +922,3 @@ _test2: ## @test2 The insertps's of $0 are pointless complex copies. //===---------------------------------------------------------------------===// - -If SSE4.1 is available we should inline rounding functions instead of emitting -a libcall. - -floor: roundsd $0x01, %xmm, %xmm -ceil: roundsd $0x02, %xmm, %xmm - -and likewise for the single precision versions. - -Currently, SelectionDAGBuilder doesn't turn calls to these functions into the -corresponding nodes and some targets (including X86) aren't ready for them. 
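Back to the VEX_I8IMM change in X86MCCodeEmitter.cpp above: the extra source register lands in bits[7:4] of the trailing imm8 (bit 7 marking an x86-64 extended register) and, when a fifth operand is present, its small immediate fills bits[3:0]. A hedged standalone sketch of that packing follows; packI8Imm is an invented helper name, not an LLVM API.

// Standalone sketch of the imm8 packing described above: the extra source
// register occupies bits[7:4] (bit 7 flagging an x86-64 extended register),
// and an optional trailing 4-bit immediate occupies bits[3:0].
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint8_t packI8Imm(unsigned RegNum, bool IsExtReg, int Imm /* -1 = none */) {
  assert(RegNum < 8 && "only the low three bits of the register number");
  unsigned Byte = (IsExtReg ? 0x80u : 0x00u) | (RegNum << 4);
  if (Imm >= 0) {
    assert(Imm < 16 && "Immediate operand value out of range");
    Byte |= static_cast<unsigned>(Imm);
  }
  return static_cast<uint8_t>(Byte);
}

int main() {
  // e.g. register number 3 as the extra source plus a 4-bit immediate of 5.
  std::printf("%02x\n", static_cast<unsigned>(packI8Imm(3, false, 5)));
  return 0;
}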
- -//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/TargetInfo/CMakeLists.txt b/lib/Target/X86/TargetInfo/CMakeLists.txt index 4da00fa..b1d0b9f 100644 --- a/lib/Target/X86/TargetInfo/CMakeLists.txt +++ b/lib/Target/X86/TargetInfo/CMakeLists.txt @@ -4,10 +4,4 @@ add_llvm_library(LLVMX86Info X86TargetInfo.cpp ) -add_llvm_library_dependencies(LLVMX86Info - LLVMMC - LLVMSupport - LLVMTarget - ) - add_dependencies(LLVMX86Info X86CommonTableGen) diff --git a/lib/Target/X86/TargetInfo/LLVMBuild.txt b/lib/Target/X86/TargetInfo/LLVMBuild.txt index ee015bd..3c64a22 100644 --- a/lib/Target/X86/TargetInfo/LLVMBuild.txt +++ b/lib/Target/X86/TargetInfo/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86Info parent = X86 required_libraries = MC Support Target add_to_library_groups = X86 - diff --git a/lib/Target/X86/Utils/CMakeLists.txt b/lib/Target/X86/Utils/CMakeLists.txt index caffd8b..2e72c34 100644 --- a/lib/Target/X86/Utils/CMakeLists.txt +++ b/lib/Target/X86/Utils/CMakeLists.txt @@ -4,9 +4,4 @@ add_llvm_library(LLVMX86Utils X86ShuffleDecode.cpp ) -add_llvm_library_dependencies(LLVMX86Utils - LLVMCore - LLVMSupport - ) - add_dependencies(LLVMX86Utils X86CommonTableGen) diff --git a/lib/Target/X86/Utils/LLVMBuild.txt b/lib/Target/X86/Utils/LLVMBuild.txt index 3ee441e..de0a30f 100644 --- a/lib/Target/X86/Utils/LLVMBuild.txt +++ b/lib/Target/X86/Utils/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86Utils parent = X86 required_libraries = Core Support add_to_library_groups = X86 - diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index f6c9d7b..e7631b6 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -95,54 +95,31 @@ void DecodePSHUFLWMask(unsigned Imm, ShuffleMask.push_back(7); } -void DecodePUNPCKLBWMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i8, NElts), ShuffleMask); -} - -void DecodePUNPCKLWDMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i16, NElts), ShuffleMask); -} - -void DecodePUNPCKLDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); -} - -void DecodePUNPCKLQDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), ShuffleMask); -} - -void DecodePUNPCKLMask(EVT VT, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(VT, ShuffleMask); -} +void DecodeSHUFPMask(EVT VT, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); -void DecodePUNPCKHMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i+NElts/2); - ShuffleMask.push_back(i+NElts+NElts/2); - } -} + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; -void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - // Part that reads from dest. - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(Imm % NElts); - Imm /= NElts; - } - // Part that reads from src. - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(Imm % NElts + NElts); - Imm /= NElts; + int NewImm = Imm; + for (unsigned l = 0; l < NumLanes; ++l) { + unsigned LaneStart = l * NumLaneElts; + // Part that reads from dest. 
+ for (unsigned i = 0; i != NumLaneElts/2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + LaneStart); + NewImm /= NumLaneElts; + } + // Part that reads from src. + for (unsigned i = 0; i != NumLaneElts/2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + NumElts + LaneStart); + NewImm /= NumLaneElts; + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm } } -void DecodeUNPCKHPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -161,10 +138,10 @@ void DecodeUNPCKHPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { } } -/// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd /// etc. VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. -void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -183,36 +160,23 @@ void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { } } -// DecodeVPERMILPSMask - Decodes VPERMILPS permutes for any 128-bit 32-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes and the mask of the first lane must -// be the same of the second. -void DecodeVPERMILPSMask(unsigned NumElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - unsigned NumLanes = (NumElts*32)/128; - unsigned LaneSize = NumElts/NumLanes; - - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = 0; i != LaneSize; ++i) { - unsigned Idx = (Imm >> (i*2)) & 0x3 ; - ShuffleMask.push_back(Idx+(l*LaneSize)); - } - } -} +// DecodeVPERMILPMask - Decodes VPERMILPS/ VPERMILPD permutes for any 128-bit +// 32-bit or 64-bit elements. For 256-bit vectors, it's considered as two 128 +// lanes. For VPERMILPS, referenced elements can't cross lanes and the mask of +// the first lane must be the same of the second. +void DecodeVPERMILPMask(EVT VT, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); -// DecodeVPERMILPDMask - Decodes VPERMILPD permutes for any 128-bit 64-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes but the mask of the first lane can -// be the different of the second (not like VPERMILPS). -void DecodeVPERMILPDMask(unsigned NumElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - unsigned NumLanes = (NumElts*64)/128; - unsigned LaneSize = NumElts/NumLanes; + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; - for (unsigned l = 0; l < NumLanes; ++l) { - for (unsigned i = l*LaneSize; i < LaneSize*(l+1); ++i) { - unsigned Idx = (Imm >> i) & 0x1; - ShuffleMask.push_back(Idx+(l*LaneSize)); + for (unsigned l = 0; l != NumLanes; ++l) { + unsigned LaneStart = l*NumLaneElts; + for (unsigned i = 0; i != NumLaneElts; ++i) { + unsigned Idx = NumLaneElts == 4 ? 
(Imm >> (i*2)) & 0x3 + : (Imm >> (i+LaneStart)) & 0x1; + ShuffleMask.push_back(Idx+LaneStart); } } } diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 35f6530..243728f 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -46,50 +46,25 @@ void DecodePSHUFHWMask(unsigned Imm, void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask); -void DecodePUNPCKLBWMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodePUNPCKLWDMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodePUNPCKLDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodePUNPCKLQDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodePUNPCKLMask(EVT VT, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodePUNPCKHMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeSHUFPMask(EVT VT, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); -/// DecodeUNPCKHPMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd /// etc. VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. -void DecodeUNPCKHPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); -/// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd /// etc. VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. -void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); - +void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); -// DecodeVPERMILPSMask - Decodes VPERMILPS permutes for any 128-bit 32-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes and the mask of the first lane must -// be the same of the second. -void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); -// DecodeVPERMILPDMask - Decodes VPERMILPD permutes for any 128-bit 64-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes but the mask of the first lane can -// be the different of the second (not like VPERMILPS). -void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm, +// DecodeVPERMILPMask - Decodes VPERMILPS/ VPERMILPD permutes for any 128-bit +// 32-bit or 64-bit elements. For 256-bit vectors, it's considered as two 128 +// lanes. For VPERMILPS, referenced elements can't cross lanes and the mask of +// the first lane must be the same of the second. 
+void DecodeVPERMILPMask(EVT VT, unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask); void DecodeVPERM2F128Mask(unsigned Imm, diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 62a7016..8229ca5 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -91,6 +91,8 @@ def FeatureFMA3 : SubtargetFeature<"fma3", "HasFMA3", "true", "Enable three-operand fused multiple-add">; def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", "Enable four-operand fused multiple-add">; +def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", + "Enable XOP instructions">; def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem", "HasVectorUAMem", "true", "Allow unaligned memory operands on vector/SIMD instructions">; @@ -194,14 +196,16 @@ def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, FeatureSlowBTMem]>; def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, + Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, FeatureSlowBTMem]>; -def : Proc<"barcelona", [FeatureSSE3, FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem]>; -def : Proc<"istanbul", [Feature3DNowA, FeatureCMPXCHG16B, - FeatureSSE4A]>; -def : Proc<"shanghai", [Feature3DNowA, FeatureCMPXCHG16B, FeatureSSE4A]>; +// FIXME: Disabling AVX for now since it's not ready. +def : Proc<"bdver1", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B, + FeatureAES, FeatureCLMUL, FeatureFMA4, + FeatureXOP, FeatureLZCNT]>; +def : Proc<"bdver2", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B, + FeatureAES, FeatureCLMUL, FeatureFMA4, + FeatureXOP, FeatureF16C, FeatureLZCNT, + FeatureBMI]>; def : Proc<"winchip-c6", [FeatureMMX]>; def : Proc<"winchip2", [Feature3DNow]>; diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 77b9905..aab2a05 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -158,10 +158,15 @@ def CC_X86_64_C : CallingConv<[ CCIfSubtarget<"hasXMM()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, - // The first 8 256-bit vector arguments are passed in YMM registers. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCIfSubtarget<"hasAVX()", - CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7]>>>, + // The first 8 256-bit vector arguments are passed in YMM registers, unless + // this is a vararg function. + // FIXME: This isn't precisely correct; the x86-64 ABI document says that + // fixed arguments to vararg functions are supposed to be passed in + // registers. Actually modeling that would be a lot of work, though. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, + YMM4, YMM5, YMM6, YMM7]>>>>, // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. 
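Looking back at the X86ShuffleDecode.cpp rewrite above: the new decoders walk a 256-bit vector as two independent 128-bit lanes, so one routine covers both the SSE and the AVX (Y) forms. Below is a minimal standalone sketch of the SHUFP-style decode under that assumption; it is plain C++, not the LLVM helper, and decodeShufpSketch is an invented name.

// Lane-wise SHUFP mask decode as sketched from the patch above: within each
// 128-bit lane, the low half of the result reads from the first source and
// the high half from the second (offset by NumElts), and 4-element lanes
// re-read the 8-bit immediate per lane.
#include <cstdio>
#include <vector>

static std::vector<unsigned> decodeShufpSketch(unsigned NumElts,
                                               unsigned NumLanes,
                                               unsigned Imm) {
  std::vector<unsigned> Mask;
  unsigned NumLaneElts = NumElts / NumLanes;
  unsigned NewImm = Imm;
  for (unsigned l = 0; l != NumLanes; ++l) {
    unsigned LaneStart = l * NumLaneElts;
    // Elements taken from the first source operand.
    for (unsigned i = 0; i != NumLaneElts / 2; ++i) {
      Mask.push_back(NewImm % NumLaneElts + LaneStart);
      NewImm /= NumLaneElts;
    }
    // Elements taken from the second source operand.
    for (unsigned i = 0; i != NumLaneElts / 2; ++i) {
      Mask.push_back(NewImm % NumLaneElts + NumElts + LaneStart);
      NewImm /= NumLaneElts;
    }
    if (NumLaneElts == 4)
      NewImm = Imm; // reload the immediate for the next 4-element lane
  }
  return Mask;
}

int main() {
  // v8f32-style decode (two lanes of four elements) with immediate 0x1B.
  for (unsigned Idx : decodeShufpSketch(8, 2, 0x1B))
    std::printf("%u ", Idx);
  std::printf("\n");
  return 0;
}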
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index ba615a8..ed16e88 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -1004,7 +1004,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, break; } - if (!Desc->isVariadic() && CurOp != NumOps) { + if (!MI.isVariadic() && CurOp != NumOps) { #ifndef NDEBUG dbgs() << "Cannot encode all operands of: " << MI << "\n"; #endif diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 32f1770..1589439 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -728,7 +728,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && GuaranteedTailCallOpt) + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) return false; // Let SDISel handle vararg functions. @@ -1529,7 +1529,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && GuaranteedTailCallOpt) + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) return false; PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); @@ -1543,7 +1543,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Fast-isel doesn't know about callee-pop yet. if (X86::isCalleePop(CC, Subtarget->is64Bit(), isVarArg, - GuaranteedTailCallOpt)) + TM.Options.GuaranteedTailCallOpt)) return false; // Check whether the function can return without sret-demotion. @@ -2121,7 +2121,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { default: return false; case MVT::f32: if (X86ScalarSSEf32) { - Opc = Subtarget->hasAVX() ? X86::VFsFLD0SS : X86::FsFLD0SS; + Opc = X86::FsFLD0SS; RC = X86::FR32RegisterClass; } else { Opc = X86::LD_Fp032; @@ -2130,7 +2130,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { break; case MVT::f64: if (X86ScalarSSEf64) { - Opc = Subtarget->hasAVX() ? X86::VFsFLD0SD : X86::FsFLD0SD; + Opc = X86::FsFLD0SD; RC = X86::FR64RegisterClass; } else { Opc = X86::LD_Fp064; diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 819d242..6a40cc1 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -47,7 +47,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineModuleInfo &MMI = MF.getMMI(); const TargetRegisterInfo *RI = TM.getRegisterInfo(); - return (DisableFramePointerElim(MF) || + return (MF.getTarget().Options.DisableFramePointerElim(MF) || RI->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || @@ -210,7 +210,7 @@ static void mergeSPUpdatesDown(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, uint64_t *NumBytes = NULL) { - // FIXME: THIS ISN'T RUN!!! + // FIXME: THIS ISN'T RUN!!! return; if (MBBI == MBB.end()) return; @@ -351,20 +351,22 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, /// register. The number corresponds to the enum lists in /// compact_unwind_encoding.h. 
static int getCompactUnwindRegNum(const unsigned *CURegs, unsigned Reg) { - int Idx = 1; - for (; *CURegs; ++CURegs, ++Idx) + for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; return -1; } +// Number of registers that can be saved in a compact unwind encoding. +#define CU_NUM_SAVED_REGS 6 + /// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding /// used with frameless stacks. It is passed the number of registers to be saved /// and an array of the registers saved. -static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6], - unsigned RegCount, - bool Is64Bit) { +static uint32_t +encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], + unsigned RegCount, bool Is64Bit) { // The saved registers are numbered from 1 to 6. In order to encode the order // in which they were saved, we re-number them according to their place in the // register order. The re-numbering is relative to the last re-numbered @@ -385,14 +387,21 @@ static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6], }; const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); - uint32_t RenumRegs[6]; - for (unsigned i = 6 - RegCount; i < 6; ++i) { + for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]); if (CUReg == -1) return ~0U; SavedRegs[i] = CUReg; + } + + // Reverse the list. + std::swap(SavedRegs[0], SavedRegs[5]); + std::swap(SavedRegs[1], SavedRegs[4]); + std::swap(SavedRegs[2], SavedRegs[3]); + uint32_t RenumRegs[CU_NUM_SAVED_REGS]; + for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i) { unsigned Countless = 0; - for (unsigned j = 6 - RegCount; j < i; ++j) + for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) if (SavedRegs[j] < SavedRegs[i]) ++Countless; @@ -435,8 +444,9 @@ static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6], /// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a /// compact encoding with a frame pointer. -static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[6], - bool Is64Bit) { +static uint32_t +encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], + bool Is64Bit) { static const unsigned CU32BitRegs[] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; @@ -448,13 +458,16 @@ static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[6], // Encode the registers in the order they were saved, 3-bits per register. The // registers are numbered from 1 to 6. uint32_t RegEnc = 0; - for (int I = 5; I >= 0; --I) { + for (int I = 0; I != 6; --I) { unsigned Reg = SavedRegs[I]; if (Reg == 0) break; int CURegNum = getCompactUnwindRegNum(CURegs, Reg); if (CURegNum == -1) return ~0U; - RegEnc |= (CURegNum & 0x7) << (5 - I); + + // Encode the 3-bit register number in order, skipping over 3-bits for each + // register. 
+ RegEnc |= (CURegNum & 0x7) << ((5 - I) * 3); } assert((RegEnc & 0x7FFF) == RegEnc && "Invalid compact register encoding!"); @@ -466,14 +479,11 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { unsigned FramePtr = RegInfo->getFrameRegister(MF); unsigned StackPtr = RegInfo->getStackRegister(); - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); - bool Is64Bit = STI.is64Bit(); bool HasFP = hasFP(MF); - unsigned SavedRegs[6] = { 0, 0, 0, 0, 0, 0 }; - int SavedRegIdx = 6; + unsigned SavedRegs[CU_NUM_SAVED_REGS] = { 0, 0, 0, 0, 0, 0 }; + unsigned SavedRegIdx = 0; unsigned OffsetSize = (Is64Bit ? 8 : 4); @@ -481,14 +491,13 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { unsigned PushInstrSize = 1; unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr); unsigned MoveInstrSize = (Is64Bit ? 3 : 2); - unsigned SubtractInstr = getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta); unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2); unsigned StackDivide = (Is64Bit ? 8 : 4); unsigned InstrOffset = 0; - unsigned CFAOffset = 0; unsigned StackAdjust = 0; + unsigned StackSize = 0; MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB. bool ExpectEnd = false; @@ -504,10 +513,10 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { if (Opc == PushInstr) { // If there are too many saved registers, we cannot use compact encoding. - if (--SavedRegIdx < 0) return 0; + if (SavedRegIdx >= CU_NUM_SAVED_REGS) return 0; - SavedRegs[SavedRegIdx] = MI.getOperand(0).getReg(); - CFAOffset += OffsetSize; + SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg(); + StackAdjust += OffsetSize; InstrOffset += PushInstrSize; } else if (Opc == MoveInstr) { unsigned SrcReg = MI.getOperand(1).getReg(); @@ -516,13 +525,14 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { if (DstReg != FramePtr || SrcReg != StackPtr) return 0; - CFAOffset = 0; + StackAdjust = 0; memset(SavedRegs, 0, sizeof(SavedRegs)); - SavedRegIdx = 6; + SavedRegIdx = 0; InstrOffset += MoveInstrSize; - } else if (Opc == SubtractInstr) { - if (StackAdjust) - // We all ready have a stack pointer adjustment. + } else if (Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) { + if (StackSize) + // We already have a stack size. return 0; if (!MI.getOperand(0).isReg() || @@ -533,7 +543,7 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // %RSP<def> = SUB64ri8 %RSP, 48 return 0; - StackAdjust = MI.getOperand(2).getImm() / StackDivide; + StackSize = MI.getOperand(2).getImm() / StackDivide; SubtractInstrIdx += InstrOffset; ExpectEnd = true; } @@ -541,28 +551,30 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // Encode that we are using EBP/RBP as the frame pointer. uint32_t CompactUnwindEncoding = 0; - CFAOffset /= StackDivide; + StackAdjust /= StackDivide; if (HasFP) { - if ((CFAOffset & 0xFF) != CFAOffset) + if ((StackAdjust & 0xFF) != StackAdjust) // Offset was too big for compact encoding. return 0; // Get the encoding of the saved registers when we have a frame pointer. 
uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit); - if (RegEnc == ~0U) - return 0; + if (RegEnc == ~0U) return 0; CompactUnwindEncoding |= 0x01000000; - CompactUnwindEncoding |= (CFAOffset & 0xFF) << 16; + CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16; CompactUnwindEncoding |= RegEnc & 0x7FFF; } else { - unsigned FullOffset = CFAOffset + StackAdjust; - if ((FullOffset & 0xFF) == FullOffset) { - // Frameless stack. + ++StackAdjust; + uint32_t TotalStackSize = StackAdjust + StackSize; + if ((TotalStackSize & 0xFF) == TotalStackSize) { + // Frameless stack with a small stack size. CompactUnwindEncoding |= 0x02000000; - CompactUnwindEncoding |= (FullOffset & 0xFF) << 16; + + // Encode the stack size. + CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16; } else { - if ((CFAOffset & 0x7) != CFAOffset) + if ((StackAdjust & 0x7) != StackAdjust) // The extra stack adjustments are too big for us to handle. return 0; @@ -573,16 +585,21 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // instruction. CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16; - // Encode any extra stack stack changes (done via push instructions). - CompactUnwindEncoding |= (CFAOffset & 0x7) << 13; + // Encode any extra stack stack adjustments (done via push instructions). + CompactUnwindEncoding |= (StackAdjust & 0x7) << 13; } + // Encode the number of registers saved. + CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10; + // Get the encoding of the saved registers when we don't have a frame // pointer. - uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegs, - 6 - SavedRegIdx, - Is64Bit); + uint32_t RegEnc = + encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx, + Is64Bit); if (RegEnc == ~0U) return 0; + + // Encode the register encoding. CompactUnwindEncoding |= RegEnc & 0x3FF; } @@ -638,10 +655,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // stack pointer (we fit in the Red Zone). if (Is64Bit && !Fn->hasFnAttr(Attribute::NoRedZone) && !RegInfo->needsStackRealignment(MF) && - !MFI->hasVarSizedObjects() && // No dynamic alloca. - !MFI->adjustsStack() && // No calls. - !IsWin64 && // Win64 has no Red Zone - !EnableSegmentedStacks) { // Regular stack + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64 && // Win64 has no Red Zone + !MF.getTarget().Options.EnableSegmentedStacks) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); @@ -978,7 +995,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned Opc = PI->getOpcode(); if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && - !PI->getDesc().isTerminator()) + !PI->isTerminator()) break; --MBBI; @@ -1306,6 +1323,10 @@ GetScratchRegister(bool Is64Bit, const MachineFunction &MF) { } } +// The stack limit in the TCB is set to this many bytes above the actual stack +// limit. 
+static const uint64_t kSplitStackAvailable = 256; + void X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MachineBasicBlock &prologueMBB = MF.front(); @@ -1360,16 +1381,24 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { TlsReg = X86::FS; TlsOffset = 0x70; - BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP) - .addImm(0).addReg(0).addImm(-StackSize).addReg(0); + if (StackSize < kSplitStackAvailable) + ScratchReg = X86::RSP; + else + BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP) + .addImm(0).addReg(0).addImm(-StackSize).addReg(0); + BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg) .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else { TlsReg = X86::GS; TlsOffset = 0x30; - BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) - .addImm(0).addReg(0).addImm(-StackSize).addReg(0); + if (StackSize < kSplitStackAvailable) + ScratchReg = X86::ESP; + else + BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) + .addImm(0).addReg(0).addImm(-StackSize).addReg(0); + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } @@ -1394,9 +1423,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MF.getRegInfo().setPhysRegUsed(X86::R10); MF.getRegInfo().setPhysRegUsed(X86::R11); } else { - // Since we'll call __morestack, stack alignment needs to be preserved. - BuildMI(allocMBB, DL, TII.get(X86::SUB32ri), X86::ESP).addReg(X86::ESP) - .addImm(8); BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) @@ -1411,11 +1437,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) .addExternalSymbol("__morestack"); - // __morestack only seems to remove 8 bytes off the stack. Add back the - // additional 8 bytes we added before pushing the arguments. - if (!Is64Bit) - BuildMI(allocMBB, DL, TII.get(X86::ADD32ri), X86::ESP).addReg(X86::ESP) - .addImm(8); if (IsNested) BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); else diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 96c6f41..03727a2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -256,7 +256,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->is64Bit()) { setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); - } else if (!UseSoftFloat) { + } else if (!TM.Options.UseSoftFloat) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. 
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); @@ -270,7 +270,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); - if (!UseSoftFloat) { + if (!TM.Options.UseSoftFloat) { // SSE has no i16 to fp conversion, only i32 if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); @@ -313,7 +313,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->is64Bit()) { setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); - } else if (!UseSoftFloat) { + } else if (!TM.Options.UseSoftFloat) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) // Expand FP_TO_UINT into a select. @@ -378,6 +378,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i16 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i64 , Expand); if (Subtarget->hasBMI()) { setOperationAction(ISD::CTTZ , MVT::i8 , Promote); } else { @@ -388,6 +392,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::CTTZ , MVT::i64 , Custom); } + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i8 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i16 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i64 , Expand); if (Subtarget->hasLZCNT()) { setOperationAction(ISD::CTLZ , MVT::i8 , Promote); } else { @@ -537,14 +545,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Custom); - else if (EnableSegmentedStacks) + else if (TM.Options.EnableSegmentedStacks) setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Custom); else setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Expand); - if (!UseSoftFloat && X86ScalarSSEf64) { + if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, X86::FR32RegisterClass); @@ -576,7 +584,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps - } else if (!UseSoftFloat && X86ScalarSSEf32) { + } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, X86::FR32RegisterClass); @@ -605,11 +613,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } - } else if (!UseSoftFloat) { + } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. // Set up the FP register classes. 
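Back at the top of this hunk, the new CTTZ_ZERO_UNDEF and CTLZ_ZERO_UNDEF entries cover the count-zero variants whose result is undefined for a zero input; marking them Expand should simply let legalization fall back to the plain CTTZ/CTLZ handling chosen right after, while BMI's TZCNT and LZCNT make even the zero input well defined in hardware. In source-level terms the difference in contract is roughly:

    #include <cstdint>

    // cttz_zero_undef only needs to agree with cttz for non-zero inputs,
    // which is exactly the contract of the GCC/Clang builtin.
    unsigned cttzZeroUndef(uint64_t x) {
      return static_cast<unsigned>(__builtin_ctzll(x));            // caller promises x != 0
    }

    unsigned cttz(uint64_t x) {
      return x ? static_cast<unsigned>(__builtin_ctzll(x)) : 64;   // defined for 0 as well
    }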
addRegisterClass(MVT::f64, X86::RFP64RegisterClass); @@ -620,7 +628,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } @@ -639,7 +647,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87. - if (!UseSoftFloat) { + if (!TM.Options.UseSoftFloat) { addRegisterClass(MVT::f80, X86::RFP80RegisterClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -658,11 +666,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f80 , Expand); setOperationAction(ISD::FCOS , MVT::f80 , Expand); } + setOperationAction(ISD::FFLOOR, MVT::f80, Expand); + setOperationAction(ISD::FCEIL, MVT::f80, Expand); + setOperationAction(ISD::FTRUNC, MVT::f80, Expand); + setOperationAction(ISD::FRINT, MVT::f80, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); } @@ -714,7 +727,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); @@ -748,7 +763,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. - if (!UseSoftFloat && Subtarget->hasMMX()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); // No operations on x86mmx supported, everything uses intrinsics. 
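Most of the mechanical churn in these X86ISelLowering hunks is one refactor: flags such as UseSoftFloat, UnsafeFPMath, GuaranteedTailCallOpt and EnableSegmentedStacks stop being free-standing command-line globals and become members of an options struct hung off the target machine, read as TM.Options.<flag> in the constructor and as getTargetMachine().Options.<flag> or DAG.getTarget().Options.<flag> elsewhere. A minimal sketch of the shape of that change (the struct and function names below are illustrative, not the LLVM types):

    // Before: each flag was a mutable global any file could read.
    // After: the flags travel with the TargetMachine that owns the compilation.
    struct OptionsSketch {
      bool UnsafeFPMath;
      bool UseSoftFloat;
      bool GuaranteedTailCallOpt;
      bool EnableSegmentedStacks;
    };

    struct TargetMachineSketch {
      OptionsSketch Options;
    };

    bool fpExceptionsPreserved(const TargetMachineSketch &TM) {
      return !TM.Options.UnsafeFPMath;   // was: if (!UnsafeFPMath)
    }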
} @@ -785,7 +800,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); - if (!UseSoftFloat && Subtarget->hasXMM()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasXMM()) { addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); @@ -802,7 +817,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SETCC, MVT::v4f32, Custom); } - if (!UseSoftFloat && Subtarget->hasXMMInt()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasXMMInt()) { addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM @@ -983,7 +998,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->hasSSE42orAVX()) setOperationAction(ISD::SETCC, MVT::v2i64, Custom); - if (!UseSoftFloat && Subtarget->hasAVX()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); @@ -1211,10 +1226,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; - setPrefLoopAlignment(16); + setPrefLoopAlignment(4); // 2^4 bytes. benefitFromCodePlacementOpt = true; - setPrefFunctionAlignment(4); + setPrefFunctionAlignment(4); // 2^4 bytes. } @@ -1709,7 +1724,8 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { /// FuncIsMadeTailCallSafe - Return true if the function is being made into /// a tailcall target by changing its ABI. -static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { +static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, + bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && IsTailCallConvention(CC); } @@ -1723,7 +1739,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); + bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv, + getTargetMachine().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; @@ -1873,7 +1890,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. 
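The loop-alignment tweak a little further up in this hunk is worth a second look, because it is not a policy change: setPrefLoopAlignment and setPrefFunctionAlignment take a log2 value (hence the new "// 2^4 bytes." comments), so the old setPrefLoopAlignment(16) was presumably a leftover from a byte-valued interface and effectively requested 2^16-byte alignment, while the new argument of 4 asks for the intended 16 bytes and matches the function alignment beside it. In numbers:

    // Alignment setters take log2(bytes):
    //   setPrefLoopAlignment(4)  requests 1u << 4  == 16-byte alignment
    //   setPrefLoopAlignment(16) requests 1u << 16 == 65536-byte alignment
    constexpr unsigned kPrefLoopAlignBytes = 1u << 4;
    static_assert(kPrefLoopAlignBytes == 16, "log2 convention");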
- if (FuncIsMadeTailCallSafe(CallConv)) + if (FuncIsMadeTailCallSafe(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for @@ -1918,9 +1936,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); assert(!(NumXMMRegs && !Subtarget->hasXMM()) && "SSE register cannot be used when SSE is disabled!"); - assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && + assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && + NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); - if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) + if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || + !Subtarget->hasXMM()) // Kernel mode asks for SSE to be disabled, so don't push them // on the stack. TotalNumXMMRegs = 0; @@ -1998,7 +2018,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } // Some CCs need callee pop. - if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) { + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. @@ -2098,7 +2119,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Sibcalls are automatically detected tailcalls which do not require // ABI changes. - if (!GuaranteedTailCallOpt && isTailCall) + if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) IsSibcall = true; if (isTailCall) @@ -2126,7 +2147,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; - else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) + else if (getTargetMachine().Options.GuaranteedTailCallOpt && + IsTailCallConvention(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -2305,7 +2327,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, int FI = 0; // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); - if (GuaranteedTailCallOpt) { + if (getTargetMachine().Options.GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (VA.isRegLoc()) @@ -2485,7 +2507,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Create the CALLSEQ_END node. 
unsigned NumBytesForCalleeToPush; - if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + getTargetMachine().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPush = NumBytes; // Callee pops everything else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) // If this is a call to a struct-return function, the callee @@ -2643,7 +2666,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; - if (GuaranteedTailCallOpt) { + if (getTargetMachine().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; @@ -2843,23 +2866,10 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: - case X86ISD::VPERM2F128: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + case X86ISD::VPERMILP: + case X86ISD::VPERM2X128: return true; } return false; @@ -2885,10 +2895,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: + case X86ISD::VPERMILP: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } @@ -2902,7 +2909,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::PALIGN: case X86ISD::SHUFPD: case X86ISD::SHUFPS: - case X86ISD::VPERM2F128: + case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, DAG.getConstant(TargetMask, MVT::i8)); } @@ -2920,18 +2927,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::MOVLPD: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: return DAG.getNode(Opc, dl, VT, V1, V2); } return SDValue(); @@ -3231,7 +3228,7 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, bool hasSSSE3OrAVX) { int i, e = VT.getVectorNumElements(); - if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64) + if (VT.getSizeInBits() != 128) return false; // Do not handle v2i64 / v2f64 shuffles with palignr. @@ -3261,17 +3258,17 @@ static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, return true; } -/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand +/// isVSHUFPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// VSHUFPSY. 
-static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { +static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX, bool Commuted = false) { int NumElems = VT.getVectorNumElements(); - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) + if (!HasAVX || VT.getSizeInBits() != 256) return false; - if (NumElems != 8) + if (NumElems != 4 && NumElems != 8) return false; // VSHUFPSY divides the resulting vector into 4 chunks. @@ -3284,124 +3281,63 @@ static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT, // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, // Y3..Y0, Y3..Y0, X3..X0, X3..X0 // - int QuarterSize = NumElems/4; - int HalfSize = QuarterSize*2; - for (int i = 0; i < QuarterSize; ++i) - if (!isUndefOrInRange(Mask[i], 0, HalfSize)) - return false; - for (int i = QuarterSize; i < QuarterSize*2; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) - return false; - - // The mask of the second half must be the same as the first but with - // the appropriate offsets. This works in the same way as VPERMILPS - // works with masks. - for (int i = QuarterSize*2; i < QuarterSize*3; ++i) { - if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) - return false; - int FstHalfIdx = i-HalfSize; - if (Mask[FstHalfIdx] < 0) - continue; - if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) - return false; - } - for (int i = QuarterSize*3; i < NumElems; ++i) { - if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) - return false; - int FstHalfIdx = i-HalfSize; - if (Mask[FstHalfIdx] < 0) - continue; - if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) - return false; - - } - - return true; -} - -/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VSHUFPSY instruction. -static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - - assert(NumElems == 8 && VT.getSizeInBits() == 256 && - "Only supports v8i32 and v8f32 types"); - - int HalfSize = NumElems/2; - unsigned Mask = 0; - for (int i = 0; i != NumElems ; ++i) { - if (SVOp->getMaskElt(i) < 0) - continue; - // The mask of the first half must be equal to the second one. - unsigned Shamt = (i%HalfSize)*2; - unsigned Elt = SVOp->getMaskElt(i) % HalfSize; - Mask |= Elt << Shamt; - } - - return Mask; -} - -/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 256-bit -/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS -/// version and the mask of the second half isn't binded with the first -/// one. -static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - int NumElems = VT.getVectorNumElements(); - - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) - return false; - - if (NumElems != 4) - return false; - - // VSHUFPSY divides the resulting vector into 4 chunks. + // VSHUFPDY divides the resulting vector into 4 chunks. // The sources are also splitted into 4 chunks, and each destination // chunk must come from a different source chunk. 
// // SRC1 => X3 X2 X1 X0 // SRC2 => Y3 Y2 Y1 Y0 // - // DST => Y2..Y3, X2..X3, Y1..Y0, X1..X0 + // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 // - int QuarterSize = NumElems/4; - int HalfSize = QuarterSize*2; - for (int i = 0; i < QuarterSize; ++i) - if (!isUndefOrInRange(Mask[i], 0, HalfSize)) - return false; - for (int i = QuarterSize; i < QuarterSize*2; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) - return false; - for (int i = QuarterSize*2; i < QuarterSize*3; ++i) - if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) - return false; - for (int i = QuarterSize*3; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) - return false; + unsigned QuarterSize = NumElems/4; + unsigned HalfSize = QuarterSize*2; + for (unsigned l = 0; l != 2; ++l) { + unsigned LaneStart = l*HalfSize; + for (unsigned s = 0; s != 2; ++s) { + unsigned QuarterStart = s*QuarterSize; + unsigned Src = (Commuted) ? (1-s) : s; + unsigned SrcStart = Src*NumElems + LaneStart; + for (unsigned i = 0; i != QuarterSize; ++i) { + int Idx = Mask[i+QuarterStart+LaneStart]; + if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize)) + return false; + // For VSHUFPSY, the mask of the second half must be the same as the first + // but with the appropriate offsets. This works in the same way as + // VPERMILPS works with masks. + if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0) + continue; + if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+HalfSize)) + return false; + } + } + } return true; } -/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VSHUFPDY instruction. -static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) { +/// getShuffleVSHUFPYImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VSHUFPSY/VSHUFPDY instructions. +static unsigned getShuffleVSHUFPYImmediate(SDNode *N) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); EVT VT = SVOp->getValueType(0); int NumElems = VT.getVectorNumElements(); - assert(NumElems == 4 && VT.getSizeInBits() == 256 && - "Only supports v4i64 and v4f64 types"); + assert(VT.getSizeInBits() == 256 && "Only supports 256-bit types"); + assert((NumElems == 4 || NumElems == 8) && "Only supports v4 and v8 types"); int HalfSize = NumElems/2; + unsigned Mul = (NumElems == 8) ? 2 : 1; unsigned Mask = 0; - for (int i = 0; i != NumElems ; ++i) { - if (SVOp->getMaskElt(i) < 0) + for (int i = 0; i != NumElems; ++i) { + int Elt = SVOp->getMaskElt(i); + if (Elt < 0) continue; - int Elt = SVOp->getMaskElt(i) % HalfSize; - Mask |= Elt << i; + Elt %= HalfSize; + unsigned Shamt = i; + // For VSHUFPSY, the mask of the first half must be equal to the second one. + if (NumElems == 8) Shamt %= HalfSize; + Mask |= Elt << (Shamt*Mul); } return Mask; @@ -3409,8 +3345,8 @@ static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) { /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming /// the two vector operands have swapped position. 
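Since CommuteVectorShuffleMask, declared just below, gets reused by several later hunks (including the rewritten horizontal-op matcher at the end of this file), a concrete picture of what it does helps: mask entries below NumElems select from the first operand, entries from NumElems upward select from the second, and commuting moves every defined entry to the other operand while leaving undef (negative) entries alone. A standalone equivalent with a worked example:

    #include <vector>

    // shuffle(A, B, <0, 4, 1, 5>)  is the same as  shuffle(B, A, <4, 0, 5, 1>).
    void commuteMask(std::vector<int> &Mask, unsigned NumElems) {
      for (size_t i = 0; i != Mask.size(); ++i) {
        int Idx = Mask[i];
        if (Idx < 0)
          continue;                                    // undef stays undef
        Mask[i] = Idx < (int)NumElems ? Idx + (int)NumElems
                                      : Idx - (int)NumElems;
      }
    }
    // commuteMask({0, 4, 1, 5}, 4) turns the mask into {4, 0, 5, 1}.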
-static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { - unsigned NumElems = VT.getVectorNumElements(); +static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, + unsigned NumElems) { for (unsigned i = 0; i != NumElems; ++i) { int idx = Mask[i]; if (idx < 0) @@ -3422,31 +3358,13 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { } } -/// isCommutedVSHUFP() - Return true if swapping operands will -/// allow to use the "vshufpd" or "vshufps" instruction -/// for 256-bit vectors -static bool isCommutedVSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - - unsigned NumElems = VT.getVectorNumElements(); - if ((VT.getSizeInBits() != 256) || ((NumElems != 4) && (NumElems != 8))) - return false; - - SmallVector<int, 8> CommutedMask; - for (unsigned i = 0; i < NumElems; ++i) - CommutedMask.push_back(Mask[i]); - - CommuteVectorShuffleMask(CommutedMask, VT); - return (NumElems == 4) ? isVSHUFPDYMask(CommutedMask, VT, Subtarget): - isVSHUFPSYMask(CommutedMask, VT, Subtarget); -} - - /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128-bit -/// SHUFPS and SHUFPD. -static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); +/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be +/// reverse of what x86 shuffles want. +static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool Commuted = false) { + unsigned NumElems = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128) return false; @@ -3454,12 +3372,14 @@ static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { if (NumElems != 2 && NumElems != 4) return false; - int Half = NumElems / 2; - for (int i = 0; i < Half; ++i) - if (!isUndefOrInRange(Mask[i], 0, NumElems)) + unsigned Half = NumElems / 2; + unsigned SrcStart = Commuted ? NumElems : 0; + for (unsigned i = 0; i != Half; ++i) + if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) return false; - for (int i = Half; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) + SrcStart = Commuted ? 0 : NumElems; + for (unsigned i = Half; i != NumElems; ++i) + if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) return false; return true; @@ -3471,32 +3391,6 @@ bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { return ::isSHUFPMask(M, N->getValueType(0)); } -/// isCommutedSHUFP - Returns true if the shuffle mask is exactly -/// the reverse of what x86 shuffles want. x86 shuffles requires the lower -/// half elements to come from vector 1 (which would equal the dest.) and -/// the upper half to come from vector 2. -static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); - - if (NumElems != 2 && NumElems != 4) - return false; - - int Half = NumElems / 2; - for (int i = 0; i < Half; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) - return false; - for (int i = Half; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], 0, NumElems)) - return false; - return true; -} - -static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return isCommutedSHUFPMask(M, N->getValueType(0)); -} - /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVHLPS. 
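The Commuted flag folded into isSHUFPMask above is what lets the separate isCommutedSHUFPMask disappear below it: SHUFPS builds its low result half from the first source and its high half from the second, and Commuted simply accepts the mirror-image mask that becomes legal once the two operands are swapped (which CommuteVectorShuffle later performs). For v4f32, for example (the array names are ours, for illustration):

    // shufps: result = < src1[a], src1[b], src2[c], src2[d] >
    // A mask like <1, 3, 5, 7> matches directly: low half from V1, high from V2.
    static const int ShufpsMask[4]         = {1, 3, 5, 7};
    // <5, 7, 1, 3> only matches with Commuted = true, i.e. after swapping V1/V2.
    static const int CommutedShufpsMask[4] = {5, 7, 1, 3};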
bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { @@ -3765,15 +3659,15 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) { return ::isMOVLMask(M, N->getValueType(0)); } -/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered +/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered /// as permutations between 128-bit chunks or halves. As an example: this /// shuffle bellow: /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> /// The first half comes from the second half of V1 and the second half from the /// the second half of V2. -static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) +static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX) { + if (!HasAVX || VT.getSizeInBits() != 256) return false; // The shuffle result is divided into half A and half B. In total the two @@ -3801,10 +3695,9 @@ static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, return MatchA && MatchB; } -/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERM2F128 instructions. -static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. +static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); int HalfSize = VT.getVectorNumElements()/2; @@ -3826,81 +3719,47 @@ static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { return (FstHalf | (SndHalf << 4)); } -/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand +/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. /// Note that VPERMIL mask matching is different depending whether theunderlying /// type is 32 or 64. In the VPERMILPS the high half of the mask should point /// to the same elements of the low, but to the higher half of the source. /// In VPERMILPD the two lanes could be shuffled independently of each other /// with the same restriction that lanes can't be crossed. -static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { +static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits()/128; - if (!Subtarget->hasAVX()) + if (!HasAVX) return false; - // Only match 256-bit with 64-bit types - if (VT.getSizeInBits() != 256 || NumElts != 4) + // Only match 256-bit with 32/64-bit types + if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8)) return false; - // The mask on the high lane is independent of the low. Both can match - // any element in inside its own lane, but can't cross. 
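To make the VPERM2X128 immediate above concrete: the result is described one 128-bit half at a time, each half getting a 2-bit selector that picks one of the four source halves (0 = low half of V1, 1 = high half of V1, 2 = low half of V2, 3 = high half of V2), with the low half's selector in bits 1:0 and the high half's in bits 5:4, which is the (FstHalf | (SndHalf << 4)) the code returns. The example from the doc comment works out like this:

    // v8i32 shuffle <4,5,6,7, 12,13,14,15>:
    //   result low  half = elements 4..7   -> high half of V1 -> selector 1
    //   result high half = elements 12..15 -> high half of V2 -> selector 3
    constexpr unsigned FstHalf = 1, SndHalf = 3;
    constexpr unsigned Vperm2Imm = FstHalf | (SndHalf << 4);
    static_assert(Vperm2Imm == 0x31, "vperm2f128 $0x31, V2, V1, dst");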
int LaneSize = NumElts/NumLanes; - for (int l = 0; l < NumLanes; ++l) - for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { - int LaneStart = l*LaneSize; - if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize)) + for (int l = 0; l != NumLanes; ++l) { + int LaneStart = l*LaneSize; + for (int i = 0; i != LaneSize; ++i) { + if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize)) + return false; + if (NumElts == 4 || l == 0) + continue; + // VPERMILPS handling + if (Mask[i] < 0) + continue; + if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneSize)) return false; } - - return true; -} - -/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to VPERMILPS*. -/// Note that VPERMIL mask matching is different depending whether theunderlying -/// type is 32 or 64. In the VPERMILPS the high half of the mask should point -/// to the same elements of the low, but to the higher half of the source. -/// In VPERMILPD the two lanes could be shuffled independently of each other -/// with the same restriction that lanes can't be crossed. -static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; - - if (!Subtarget->hasAVX()) - return false; - - // Only match 256-bit with 32-bit types - if (VT.getSizeInBits() != 256 || NumElts != 8) - return false; - - // The mask on the high lane should be the same as the low. Actually, - // they can differ if any of the corresponding index in a lane is undef - // and the other stays in range. - int LaneSize = NumElts/NumLanes; - for (int i = 0; i < LaneSize; ++i) { - int HighElt = i+LaneSize; - bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts); - bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize); - - if (!HighValid || !LowValid) - return false; - if (Mask[i] < 0 || Mask[HighElt] < 0) - continue; - if (Mask[HighElt]-Mask[i] != LaneSize) - return false; } return true; } -/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERMILPS* instructions. -static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +/// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERMILPS/D* instructions. +static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); int NumElts = VT.getVectorNumElements(); @@ -3911,43 +3770,22 @@ static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { // where a mask will match because the same mask element is undef on the // first half but valid on the second. This would get pathological cases // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid. + unsigned Shift = (LaneSize == 4) ? 
2 : 1; unsigned Mask = 0; - for (int l = 0; l < NumLanes; ++l) { - for (int i = 0; i < LaneSize; ++i) { - int MaskElt = SVOp->getMaskElt(i+(l*LaneSize)); - if (MaskElt < 0) - continue; - if (MaskElt >= LaneSize) - MaskElt -= LaneSize; - Mask |= MaskElt << (i*2); - } + for (int i = 0; i != NumElts; ++i) { + int MaskElt = SVOp->getMaskElt(i); + if (MaskElt < 0) + continue; + MaskElt %= LaneSize; + unsigned Shamt = i; + // VPERMILPSY, the mask of the first half must be equal to the second one + if (NumElts == 8) Shamt %= LaneSize; + Mask |= MaskElt << (Shamt*Shift); } return Mask; } -/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERMILPD* instructions. -static unsigned getShuffleVPERMILPDImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - - unsigned Mask = 0; - int LaneSize = NumElts/NumLanes; - for (int l = 0; l < NumLanes; ++l) - for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { - int MaskElt = SVOp->getMaskElt(i); - if (MaskElt < 0) - continue; - Mask |= (MaskElt-l*LaneSize) << i; - } - - return Mask; -} - /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse /// of what x86 movss want. X86 movs requires the lowest element to be lowest /// element of vector 2 and the other elements to come from vector 1 in order. @@ -4035,21 +3873,18 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. -static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - EVT VT = N->getValueType(0); +static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX) { int NumElts = VT.getVectorNumElements(); - bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF; - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 || - !V2IsUndef || NumElts != 4) + if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4) return false; for (int i = 0; i != NumElts/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), 0)) + if (!isUndefOrEqual(Mask[i], 0)) return false; for (int i = NumElts/2; i != NumElts; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2)) + if (!isUndefOrEqual(Mask[i], NumElts/2)) return false; return true; } @@ -4164,14 +3999,13 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 
-unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VVT = N->getValueType(0); - unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; +static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; int Val = 0; unsigned i, e; - for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { + for (i = 0, e = VT.getVectorNumElements(); i != e; ++i) { Val = SVOp->getMaskElt(i); if (Val >= 0) break; @@ -4631,29 +4465,14 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, case X86ISD::SHUFPS: case X86ISD::SHUFPD: ImmN = N->getOperand(N->getNumOperands()-1); - DecodeSHUFPSMask(NumElems, - cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - DecodePUNPCKHMask(NumElems, ShuffleMask); - break; - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - DecodeUNPCKHPMask(VT, ShuffleMask); + DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); break; - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - DecodePUNPCKLMask(VT, ShuffleMask); + case X86ISD::UNPCKH: + DecodeUNPCKHMask(VT, ShuffleMask); break; - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - DecodeUNPCKLPMask(VT, ShuffleMask); + case X86ISD::UNPCKL: + DecodeUNPCKLMask(VT, ShuffleMask); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, ShuffleMask); @@ -4686,27 +4505,12 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, Depth+1); } - case X86ISD::VPERMILPS: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPSY: + case X86ISD::VPERMILP: ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(), + DecodeVPERMILPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), ShuffleMask); break; - case X86ISD::VPERMILPD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPDY: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERM2F128: + case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), ShuffleMask); @@ -5334,8 +5138,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DAG); } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); - EVT MiddleVT = MVT::v4i32; + unsigned NumBits = VT.getSizeInBits(); + assert((NumBits == 128 || NumBits == 256) && + "Expected an SSE or AVX value type!"); + EVT MiddleVT = NumBits == 128 ? MVT::v4i32 : MVT::v8i32; Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasXMMInt(), DAG); @@ -6256,7 +6062,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // from X. 
if (NumHi == 3) { // Normalize it so the 3 elements come from V1. - CommuteVectorShuffleMask(PermMask, VT); + CommuteVectorShuffleMask(PermMask, 4); std::swap(V1, V2); } @@ -6566,70 +6372,6 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { X86::getShuffleSHUFImmediate(SVOp), DAG); } -static inline unsigned getUNPCKLOpcode(EVT VT, bool HasAVX2) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKLDQ; - case MVT::v2i64: return X86ISD::PUNPCKLQDQ; - case MVT::v8i32: - if (HasAVX2) return X86ISD::PUNPCKLDQ; - // else use fp unit for int unpack. - case MVT::v8f32: - case MVT::v4f32: return X86ISD::UNPCKLPS; - case MVT::v4i64: - if (HasAVX2) return X86ISD::PUNPCKLQDQ; - // else use fp unit for int unpack. - case MVT::v4f64: - case MVT::v2f64: return X86ISD::UNPCKLPD; - case MVT::v32i8: - case MVT::v16i8: return X86ISD::PUNPCKLBW; - case MVT::v16i16: - case MVT::v8i16: return X86ISD::PUNPCKLWD; - default: - llvm_unreachable("Unknown type for unpckl"); - } - return 0; -} - -static inline unsigned getUNPCKHOpcode(EVT VT, bool HasAVX2) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKHDQ; - case MVT::v2i64: return X86ISD::PUNPCKHQDQ; - case MVT::v8i32: - if (HasAVX2) return X86ISD::PUNPCKHDQ; - // else use fp unit for int unpack. - case MVT::v8f32: - case MVT::v4f32: return X86ISD::UNPCKHPS; - case MVT::v4i64: - if (HasAVX2) return X86ISD::PUNPCKHQDQ; - // else use fp unit for int unpack. - case MVT::v4f64: - case MVT::v2f64: return X86ISD::UNPCKHPD; - case MVT::v32i8: - case MVT::v16i8: return X86ISD::PUNPCKHBW; - case MVT::v16i16: - case MVT::v8i16: return X86ISD::PUNPCKHWD; - default: - llvm_unreachable("Unknown type for unpckh"); - } - return 0; -} - -static inline unsigned getVPERMILOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: - case MVT::v4f32: return X86ISD::VPERMILPS; - case MVT::v2i64: - case MVT::v2f64: return X86ISD::VPERMILPD; - case MVT::v8i32: - case MVT::v8f32: return X86ISD::VPERMILPSY; - case MVT::v4i64: - case MVT::v4f64: return X86ISD::VPERMILPDY; - default: - llvm_unreachable("Unknown type for vpermil"); - } - return 0; -} - static SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI, @@ -6703,17 +6445,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); - bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; bool V2IsSplat = false; bool HasXMMInt = Subtarget->hasXMMInt(); + bool HasAVX = Subtarget->hasAVX(); bool HasAVX2 = Subtarget->hasAVX2(); MachineFunction &MF = DAG.getMachineFunction(); bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + assert(V1.getOpcode() != ISD::UNDEF && "Op 1 of shuffle should not be undef"); + // Vector shuffle lowering takes 3 steps: // // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable @@ -6738,11 +6482,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and // unpckh_undef). Only use pshufd if speed is more important than size. 
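The unpckl_undef / unpckh_undef masks that note talks about are self-unpacks, which the next hunk now lowers straight to the unified UNPCKL/UNPCKH nodes: UNPCKL interleaves the low halves of its two inputs and UNPCKH the high halves, so feeding the same register twice duplicates each element of the respective half. For v4f32 (array names ours, for illustration):

    // unpcklps V, V : <v0, v1, v2, v3> -> <v0, v0, v1, v1>   (mask <0, 0, 1, 1>)
    // unpckhps V, V : <v0, v1, v2, v3> -> <v2, v2, v3, v3>   (mask <2, 2, 3, 3>)
    static const int UnpcklUndefMask[4] = {0, 0, 1, 1};
    static const int UnpckhUndefMask[4] = {2, 2, 3, 3};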
if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V1, - DAG); + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1, - DAG); + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3orAVX() && V2IsUndef && RelaxedMayFoldVectorLoad(V1)) @@ -6754,8 +6496,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Use to match splats if (HasXMMInt && X86::isUNPCKHMask(SVOp, HasAVX2) && V2IsUndef && (VT == MVT::v2f64 || VT == MVT::v2i64)) - return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1, - DAG); + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); if (X86::isPSHUFDMask(SVOp)) { // The actual implementation will match the mask in the if above and then @@ -6787,8 +6528,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } if (X86::isMOVLMask(SVOp)) { - if (V1IsUndef) - return V2; if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); if (!X86::isMOVLPMask(SVOp)) { @@ -6834,17 +6573,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { V2IsSplat = isSplatVector(V2.getNode()); // Canonicalize the splat or undef, if present, to be on the RHS. - if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { + if (V1IsSplat && !V2IsSplat) { Op = CommuteVectorShuffle(SVOp, DAG); SVOp = cast<ShuffleVectorSDNode>(Op); V1 = SVOp->getOperand(0); V2 = SVOp->getOperand(1); std::swap(V1IsSplat, V2IsSplat); - std::swap(V1IsUndef, V2IsUndef); Commuted = true; } - if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { + SmallVector<int, 32> M; + SVOp->getMask(M); + + if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { // Shuffling low element of v1 into undef, just return v1. 
if (V2IsUndef) return V1; @@ -6854,13 +6595,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVL(DAG, dl, VT, V2, V1); } - if (X86::isUNPCKLMask(SVOp, HasAVX2)) - return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V2, - DAG); + if (isUNPCKLMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (X86::isUNPCKHMask(SVOp, HasAVX2)) - return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V2, - DAG); + if (isUNPCKHMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); if (V2IsSplat) { // Normalize mask so all entries that point to V2 points to its first @@ -6884,35 +6623,30 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); if (X86::isUNPCKLMask(NewSVOp, HasAVX2)) - return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V2, V1, - DAG); + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V2, V1, DAG); if (X86::isUNPCKHMask(NewSVOp, HasAVX2)) - return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V2, V1, - DAG); + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V2, V1, DAG); } // Normalize the node to match x86 shuffle ops if needed - if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) + if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true) || + isVSHUFPYMask(M, VT, HasAVX, /* Commuted */ true))) return CommuteVectorShuffle(SVOp, DAG); // The checks below are all present in isShuffleMaskLegal, but they are // inlined here right now to enable us to directly emit target specific // nodes, and remove one by one until they don't return Op anymore. - SmallVector<int, 16> M; - SVOp->getMask(M); if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX())) return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, - X86::getShufflePALIGNRImmediate(SVOp), + getShufflePALIGNRImmediate(SVOp), DAG); if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { - if (VT == MVT::v2f64) - return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); - if (VT == MVT::v2i64) - return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); + if (VT == MVT::v2f64 || VT == MVT::v2i64) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); } if (isPSHUFHWMask(M, VT)) @@ -6929,12 +6663,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, X86::getShuffleSHUFImmediate(SVOp), DAG); - if (X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V1, - DAG); - if (X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1, - DAG); + if (isUNPCKL_v_undef_Mask(M, VT)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + if (isUNPCKH_v_undef_Mask(M, VT)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); //===--------------------------------------------------------------------===// // Generate target specific nodes for 128 or 256-bit shuffles only @@ -6942,44 +6674,23 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // // Handle VMOVDDUPY permutations - if (isMOVDDUPYMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX)) return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); - // Handle VPERMILPS* 
permutations - if (isVPERMILPSMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPSImmediate(SVOp), DAG); - - // Handle VPERMILPD* permutations - if (isVPERMILPDMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPDImmediate(SVOp), DAG); + // Handle VPERMILPS/D* permutations + if (isVPERMILPMask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, + getShuffleVPERMILPImmediate(SVOp), DAG); - // Handle VPERM2F128 permutations - if (isVPERM2F128Mask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2, - getShuffleVPERM2F128Immediate(SVOp), DAG); + // Handle VPERM2F128/VPERM2I128 permutations + if (isVPERM2X128Mask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, + V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - // Handle VSHUFPSY permutations - if (isVSHUFPSYMask(M, VT, Subtarget)) + // Handle VSHUFPS/DY permutations + if (isVSHUFPYMask(M, VT, HasAVX)) return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPSYImmediate(SVOp), DAG); - - // Handle VSHUFPDY permutations - if (isVSHUFPDYMask(M, VT, Subtarget)) - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPDYImmediate(SVOp), DAG); - - // Try to swap operands in the node to match x86 shuffle ops - if (isCommutedVSHUFPMask(M, VT, Subtarget)) { - // Now we need to commute operands. - SVOp = cast<ShuffleVectorSDNode>(CommuteVectorShuffle(SVOp, DAG)); - V1 = SVOp->getOperand(0); - V2 = SVOp->getOperand(1); - unsigned Immediate = (NumElems == 4) ? getShuffleVSHUFPDYImmediate(SVOp): - getShuffleVSHUFPSYImmediate(SVOp); - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, Immediate, DAG); - } + getShuffleVSHUFPYImmediate(SVOp), DAG); //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, @@ -7888,7 +7599,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, LLVMContext *Context = DAG.getContext(); // Build some magic constants. 
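Those magic constants are the SSE2 realization of the other uint64-to-double strategy mentioned near the top of this file: 0x43300000 and 0x45300000 are the high words of the doubles 2^52 and 2^84, so splicing the low and high 32-bit halves of the integer into the mantissas of those two doubles, subtracting the unmodified constants back out, and adding the two exact halves reconstructs the value with a single rounding. A scalar rendition of the idea, assuming IEEE-754 doubles (helper names ours):

    #include <cstdint>
    #include <cstring>

    static double fromBits(uint64_t b) { double d; std::memcpy(&d, &b, sizeof d); return d; }

    double u64ToF64Sse2Style(uint64_t x) {
      uint32_t lo = static_cast<uint32_t>(x);
      uint32_t hi = static_cast<uint32_t>(x >> 32);
      double dlo = fromBits((0x43300000ULL << 32) | lo);   // == 2^52 + lo, exactly
      double dhi = fromBits((0x45300000ULL << 32) | hi);   // == 2^84 + hi * 2^32, exactly
      double bigLo = fromBits(0x4330000000000000ULL);      // 2^52
      double bigHi = fromBits(0x4530000000000000ULL);      // 2^84
      return (dhi - bigHi) + (dlo - bigLo);                // one rounding, in the final add
    }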
- std::vector<Constant*> CV0; + SmallVector<Constant*,4> CV0; CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); @@ -7896,7 +7607,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, Constant *C0 = ConstantVector::get(CV0); SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); - std::vector<Constant*> CV1; + SmallVector<Constant*,2> CV1; CV1.push_back( ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); CV1.push_back( @@ -8176,17 +7887,13 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, EVT EltVT = VT; if (VT.isVector()) EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; + SmallVector<Constant*,4> CV; if (EltVT == MVT::f64) { Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); - CV.push_back(C); - CV.push_back(C); + CV.assign(2, C); } else { Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + CV.assign(4, C); } Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); @@ -8201,19 +7908,18 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); EVT EltVT = VT; - if (VT.isVector()) + unsigned NumElts = VT == MVT::f64 ? 2 : 4; + if (VT.isVector()) { EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; + NumElts = VT.getVectorNumElements(); + } + SmallVector<Constant*,8> CV; if (EltVT == MVT::f64) { Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); - CV.push_back(C); - CV.push_back(C); + CV.assign(NumElts, C); } else { Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + CV.assign(NumElts, C); } Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); @@ -8221,11 +7927,12 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo::getConstantPool(), false, false, false, 16); if (VT.isVector()) { + MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::XOR, dl, MVT::v2i64, - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, + DAG.getNode(ISD::XOR, dl, XORVT, + DAG.getNode(ISD::BITCAST, dl, XORVT, Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); + DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); } else { return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } @@ -8254,7 +7961,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // type, and that won't be f80 since that is not custom lowered. // First get the sign bit of second operand. 
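FABS, FNEG and the FCOPYSIGN lowering this hunk continues into are all bit manipulations on the IEEE sign bit, which is why the constant pools above hold ~(1 << 63) and (1 << 63) (and their 32-bit analogues): fabs clears the sign with an AND, fneg flips it with an XOR, and copysign combines one operand's magnitude bits with the other's sign bit. In scalar form (helper names ours):

    #include <cstdint>
    #include <cstring>

    static uint64_t bitsOf(double d)     { uint64_t b; std::memcpy(&b, &d, sizeof b); return b; }
    static double   fromBits(uint64_t b) { double d;   std::memcpy(&d, &b, sizeof d); return d; }

    double fabsBits(double x) { return fromBits(bitsOf(x) & ~(1ULL << 63)); }   // andpd with ~sign
    double fnegBits(double x) { return fromBits(bitsOf(x) ^  (1ULL << 63)); }   // xorpd with sign
    double copysignBits(double mag, double sgn) {
      return fromBits((bitsOf(mag) & ~(1ULL << 63)) | (bitsOf(sgn) & (1ULL << 63)));
    }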
- std::vector<Constant*> CV; + SmallVector<Constant*,4> CV; if (SrcVT == MVT::f64) { CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); @@ -9253,7 +8960,7 @@ SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || - EnableSegmentedStacks) && + getTargetMachine().Options.EnableSegmentedStacks) && "This should be used only on Windows targets or when segmented stacks " "are being used"); assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); @@ -9267,7 +8974,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; - if (EnableSegmentedStacks) { + if (getTargetMachine().Options.EnableSegmentedStacks) { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -9403,7 +9110,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. - assert(!UseSoftFloat && + assert(!getTargetMachine().Options.UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && Subtarget->hasXMM()); @@ -10472,7 +10179,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, DAG.getConstant(4, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, M, R); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); @@ -10487,13 +10194,13 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, DAG.getConstant(2, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, M, R); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); // return pblendv(r, r+r, a); R = DAG.getNode(ISD::VSELECT, dl, VT, Op, - R, DAG.getNode(ISD::ADD, dl, VT, R, R)); + DAG.getNode(ISD::ADD, dl, VT, R, R), R); return R; } @@ -11194,6 +10901,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDV: return "X86ISD::BLENDV"; + case X86ISD::HADD: return "X86ISD::HADD"; + case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::FMAX: return "X86ISD::FMAX"; @@ -11266,24 +10975,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; - case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; - case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; - case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; - case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; - case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; - case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; - case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; - case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; - case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; - case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; - case 
X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; - case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; + case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; + case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; - case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; - case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; - case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; - case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; - case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128"; + case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; + case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -11391,7 +11087,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const { // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) - return isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()); + return false; // FIXME: pshufb, blends, shifts. return (VT.getVectorNumElements() == 2 || @@ -11419,7 +11115,7 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, return (isMOVLMask(Mask, VT) || isCommutedMOVLMask(Mask, VT, true) || isSHUFPMask(Mask, VT) || - isCommutedSHUFPMask(Mask, VT)); + isSHUFPMask(Mask, VT, /* Commuted */ true)); } return false; } @@ -12289,7 +11985,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - assert(EnableSegmentedStacks); + assert(getTargetMachine().Options.EnableSegmentedStacks); unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; @@ -13169,7 +12865,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -13179,7 +12875,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMIN; @@ -13197,7 +12893,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMAX; @@ -13207,7 +12903,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. 
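Every branch of this combine is fenced behind UnsafeFPMath or the known-never-NaN / known-never-zero checks for the same underlying reason: x86 MINPS/MAXPS are not symmetric in their operands. When the compare is unordered (a NaN is involved) or the operands compare equal (which is how +0.0 and -0.0 compare), the instruction returns its second operand, so turning a select into FMIN/FMAX, or swapping its operands, can silently change which zero or which NaN comes out. A scalar model of MINSS shows the asymmetry:

    #include <cmath>
    #include <cstdio>

    // x86 MINSS computes (a < b) ? a : b, so the second operand wins
    // both on unordered compares and on +0.0 vs -0.0.
    static float x86Minss(float a, float b) { return (a < b) ? a : b; }

    int main() {
      std::printf("%g %g\n", x86Minss(+0.0f, -0.0f), x86Minss(-0.0f, +0.0f)); // -0 0
      std::printf("%g %g\n", x86Minss(NAN, 1.0f),    x86Minss(1.0f, NAN));    // 1 nan
      return 0;
    }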
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -13233,7 +12929,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -13243,7 +12939,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; @@ -13268,7 +12964,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -14048,7 +13744,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, X, Y); + Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } } @@ -14232,7 +13928,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If we are saving a concatination of two XMM registers, perform two stores. + // If we are saving a concatenation of two XMM registers, perform two stores. // This is better in Sandy Bridge cause one 256-bit mem op is done via two // 128-bit ones. If in the future the cost becomes only one memory access the // first version would be better. @@ -14342,7 +14038,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const Function *F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); - bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps + bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps && Subtarget->hasXMMInt(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && @@ -14458,7 +14154,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, /// set to A, RHS to B, and the routine returns 'true'. /// Note that the binary operation should have the property that if one of the /// operands is UNDEF then the result is UNDEF. 
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { +static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // Look for the following pattern: if // A = < float a0, float a1, float a2, float a3 > // B = < float b0, float b1, float b2, float b3 > @@ -14474,7 +14170,18 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { return false; EVT VT = LHS.getValueType(); - unsigned N = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for horizontal add/sub"); + + // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to + // operate independently on 128-bit lanes. + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts / NumLanes; + assert((NumLaneElts % 2 == 0) && + "Vector type should have an even number of elements in each lane"); + unsigned HalfLaneElts = NumLaneElts/2; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask @@ -14483,7 +14190,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // NOTE: in what follows a default initialized SDValue represents an UNDEF of // type VT. SDValue A, B; - SmallVector<int, 8> LMask(N); + SmallVector<int, 16> LMask(NumElts); if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) A = LHS.getOperand(0); @@ -14493,14 +14200,14 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { } else { if (LHS.getOpcode() != ISD::UNDEF) A = LHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) LMask[i] = i; } // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; - SmallVector<int, 8> RMask(N); + SmallVector<int, 16> RMask(NumElts); if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) C = RHS.getOperand(0); @@ -14510,7 +14217,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { } else { if (RHS.getOpcode() != ISD::UNDEF) C = RHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) RMask[i] = i; } @@ -14525,30 +14232,28 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) - for (unsigned i = 0; i != N; ++i) { - unsigned Idx = RMask[i]; - if (Idx < N) - RMask[i] += N; - else if (Idx < 2*N) - RMask[i] -= N; - } + CommuteVectorShuffleMask(RMask, NumElts); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. - for (unsigned i = 0; i != N; ++i) { - unsigned LIdx = LMask[i], RIdx = RMask[i]; + for (unsigned i = 0; i != NumElts; ++i) { + int LIdx = LMask[i], RIdx = RMask[i]; // Ignore any UNDEF components. - if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N)) - || (!B.getNode() && (LIdx >= N || RIdx >= N))) + if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. 
- if (!(LIdx == 2*i && RIdx == 2*i + 1) && - !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i) + unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs + unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; + int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) return false; } @@ -14565,7 +14270,8 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. - if (Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, true)) return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -14579,7 +14285,8 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal subs from subs of shuffles. - if (Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, false)) return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -14783,7 +14490,8 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. - if ((Subtarget->hasSSSE3orAVX()) && (VT == MVT::v8i16 || VT == MVT::v4i32) && + if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1); @@ -14815,8 +14523,9 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal adds from adds of shuffles.
EVT VT = N->getValueType(0); - if ((Subtarget->hasSSSE3orAVX()) && (VT == MVT::v8i16 || VT == MVT::v4i32) && - isHorizontalBinOp(Op0, Op1, false)) + if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, false)) return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); @@ -14857,18 +14566,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::SHUFPS: // Handle all target specific shuffles case X86ISD::SHUFPD: case X86ISD::PALIGN: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: + case X86ISD::UNPCKH: + case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFD: @@ -14876,11 +14575,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: - case X86ISD::VPERM2F128: + case X86ISD::VPERMILP: + case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index ccff3a5..cfc1f88 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -273,23 +273,10 @@ namespace llvm { MOVLPD, MOVSD, MOVSS, - UNPCKLPS, - UNPCKLPD, - UNPCKHPS, - UNPCKHPD, - PUNPCKLBW, - PUNPCKLWD, - PUNPCKLDQ, - PUNPCKLQDQ, - PUNPCKHBW, - PUNPCKHWD, - PUNPCKHDQ, - PUNPCKHQDQ, - VPERMILPS, - VPERMILPSY, - VPERMILPD, - VPERMILPDY, - VPERM2F128, + UNPCKL, + UNPCKH, + VPERMILP, + VPERM2X128, VBROADCAST, // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack, @@ -468,10 +455,6 @@ namespace llvm { /// the specified VECTOR_SHUFFLE mask with PSHUFLW instruction. unsigned getShufflePSHUFLWImmediate(SDNode *N); - /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle - /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. - unsigned getShufflePALIGNRImmediate(SDNode *N); - /// getExtractVEXTRACTF128Immediate - Return the appropriate /// immediate to extract the specified EXTRACT_SUBVECTOR index /// with VEXTRACTF128 instructions.
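A note on the isHorizontalBinOp rewrite above: because AVX horizontal add/sub instructions operate on each 128-bit lane independently, the expected shuffle indices are now computed per lane instead of as a flat 2*i / 2*i+1 pair. The standalone C++ sketch below reproduces that index arithmetic outside of SelectionDAG; the helper name is invented for illustration and is not part of the patch.

#include <cassert>

// Per-element check: does (LIdx, RIdx) describe the operand pair that a
// horizontal add/sub would consume for result element i?  Mirrors the index
// computation in the new isHorizontalBinOp.
static bool isExpectedHorizontalPair(unsigned i, int LIdx, int RIdx,
                                     unsigned NumElts, unsigned NumLaneElts,
                                     bool IsCommutative) {
  assert(NumLaneElts % 2 == 0 && "each lane needs an even element count");
  unsigned HalfLaneElts = NumLaneElts / 2;
  unsigned Src = (i / HalfLaneElts) % 2;                // lane is split between srcs
  unsigned LaneStart = (i / NumLaneElts) * NumLaneElts;
  int Index = 2 * (i % HalfLaneElts) + NumElts * Src + LaneStart;
  if (LIdx == Index && RIdx == Index + 1)
    return true;
  return IsCommutative && LIdx == Index + 1 && RIdx == Index;
}

For example, with v8f32 (NumElts = 8, NumLaneElts = 4), result element 2 expects the pair (8, 9), i.e. b0 and b1 of the second source, which is what vhaddps produces in its low lane.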
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index d868773..f443088 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -58,3 +58,391 @@ let isAsmParserOnly = 1 in { defm VFNMSUBPS : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">; defm VFNMSUBPD : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W; } + +//===----------------------------------------------------------------------===// +// FMA4 - AMD 4 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + + +multiclass fma4s<bits<8> opc, string OpcodeStr> { + def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>; + +} + +multiclass fma4p<bits<8> opc, string OpcodeStr> { + def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>; + def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>; +} + +let isAsmParserOnly = 1 in { + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss">; + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd">; + defm VFMADDPS4 : fma4p<0x68, "vfmaddps">; + defm VFMADDPD4 : fma4p<0x69, "vfmaddpd">; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss">; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd">; + defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps">; + defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd">; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss">; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd">; + defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps">; + defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd">; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss">; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd">; + defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps">; + defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd">; + defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps">; + defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd">; + defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps">; + defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd">; +} + +// FMA4 Intrinsics patterns + +// VFMADD +def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, VR128:$src2, VR128:$src3), + 
(VFMADDSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMADDSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMADDSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMADDSD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMADDSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFMADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFMSUB +def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMSUBSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMSUBSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMSUBSD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMSUBSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBPS4rr VR128:$src1, VR128:$src2, 
VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFMSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFNMADD +def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMADDSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMADDSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMADDSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMADDSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMADDSD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMADDSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMADDPS4rrY VR256:$src1, VR256:$src2, 
VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFNMADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFNMADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFNMADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFNMADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFNMSUB +def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMSUBSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMSUBSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMSUBSD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMSUBSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFNMSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFNMSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFNMSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFNMSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFMADDSUB +def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, VR128:$src2, 
VR128:$src3), + (VFMADDSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMADDSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMADDSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMADDSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMADDSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMADDSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMADDSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMADDSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFMADDSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFMSUBADD +def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMSUBADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMSUBADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMSUBADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMSUBADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMSUBADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMSUBADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMSUBADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + 
VR256:$src3), + (VFMSUBADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index ecd6a93..7ba3639 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -110,6 +110,8 @@ class A7 { bits<5> Prefix = 16; } class T8XD { bits<5> Prefix = 17; } class T8XS { bits<5> Prefix = 18; } class TAXD { bits<5> Prefix = 19; } +class XOP8 { bits<5> Prefix = 20; } +class XOP9 { bits<5> Prefix = 21; } class VEX { bit hasVEXPrefix = 1; } class VEX_W { bit hasVEX_WPrefix = 1; } class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; } @@ -118,7 +120,8 @@ class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } - +class XOP_W { bit hasXOP_WPrefix = 1; } +class XOP { bit hasXOP_Prefix = 1; } class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, string AsmStr, Domain d = GenericDomain> : Instruction { @@ -158,6 +161,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? + bit hasXOP_WPrefix = 0; // Same bit as VEX_W, but used for swapping operands + bit hasXOP_Prefix = 0; // Does this inst require an XOP prefix? // TSFlags layout should be kept in sync with X86InstrInfo.h. let TSFlags{5-0} = FormBits; @@ -179,6 +184,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{38} = hasVEX_L; let TSFlags{39} = ignoresVEX_L; let TSFlags{40} = has3DNow0F0FOpcode; + let TSFlags{41} = hasXOP_WPrefix; + let TSFlags{42} = hasXOP_Prefix; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -332,6 +339,10 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, TB, Requires<[HasAVX]>; +class VoPSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, + Requires<[HasXMM]>; // SSE2 Instruction Templates: // @@ -496,6 +507,30 @@ class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, OpSize, VEX_4V, Requires<[HasFMA3]>; +// FMA4 Instruction Templates +class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + OpSize, VEX_4V, VEX_I8IMM, Requires<[HasFMA4]>; + +// XOP 2, 3 and 4 Operand Instruction Template +class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, + XOP, XOP9, Requires<[HasXOP]>; + +// XOP 2, 3 and 4 Operand Instruction Templates with imm byte +class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, + XOP, XOP8, Requires<[HasXOP]>; + +// XOP 5 operand instruction (VEX encoding!) +class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + OpSize, VEX_4V, VEX_I8IMM, Requires<[HasXOP]>; + // X86-64 Instruction templates... 
// diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 791bbe6..cd13bc4 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -130,28 +130,12 @@ def X86Movhlpd : SDNode<"X86ISD::MOVHLPD", SDTShuff2Op>; def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; -def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>; -def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>; +def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; +def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; -def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>; -def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>; +def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>; -def X86Punpcklbw : SDNode<"X86ISD::PUNPCKLBW", SDTShuff2Op>; -def X86Punpcklwd : SDNode<"X86ISD::PUNPCKLWD", SDTShuff2Op>; -def X86Punpckldq : SDNode<"X86ISD::PUNPCKLDQ", SDTShuff2Op>; -def X86Punpcklqdq : SDNode<"X86ISD::PUNPCKLQDQ", SDTShuff2Op>; - -def X86Punpckhbw : SDNode<"X86ISD::PUNPCKHBW", SDTShuff2Op>; -def X86Punpckhwd : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>; -def X86Punpckhdq : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>; -def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>; - -def X86VPermilps : SDNode<"X86ISD::VPERMILPS", SDTShuff2OpI>; -def X86VPermilpsy : SDNode<"X86ISD::VPERMILPSY", SDTShuff2OpI>; -def X86VPermilpd : SDNode<"X86ISD::VPERMILPD", SDTShuff2OpI>; -def X86VPermilpdy : SDNode<"X86ISD::VPERMILPDY", SDTShuff2OpI>; - -def X86VPerm2f128 : SDNode<"X86ISD::VPERM2F128", SDTShuff3OpI>; +def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; @@ -363,12 +347,6 @@ def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{ return getI8Imm(X86::getShufflePSHUFLWImmediate(N)); }]>; -// SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to -// a PALIGNR imm. -def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{ - return getI8Imm(X86::getShufflePALIGNRImmediate(N)); -}]>; - // EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index // to VEXTRACTF128 imm. def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{ diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 24c4a53..7d1b9a1 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -1528,9 +1528,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); // Build and insert into an implicit UNDEF value. This is OK because // well be shifting and then extracting the lower 16-bits. - BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2); + BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2); InsMI2 = - BuildMI(*MFI, MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) + BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) .addReg(leaInReg2, RegState::Define, X86::sub_16bit) .addReg(Src2, getKillRegState(isKill2)); addRegReg(MIB, leaInReg, true, leaInReg2, true); @@ -2040,13 +2040,12 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { } bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { - const MCInstrDesc &MCID = MI->getDesc(); - if (!MCID.isTerminator()) return false; + if (!MI->isTerminator()) return false; // Conditional branch is a special case. 
- if (MCID.isBranch() && !MCID.isBarrier()) + if (MI->isBranch() && !MI->isBarrier()) return true; - if (!MCID.isPredicable()) + if (!MI->isPredicable()) return true; return !isPredicated(MI); } @@ -2072,7 +2071,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // A terminator that isn't a branch can't easily be handled by this // analysis. - if (!I->getDesc().isBranch()) + if (!I->isBranch()) return true; // Handle unconditional branches. @@ -2556,6 +2555,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); switch (MI->getOpcode()) { case X86::V_SET0: + case X86::FsFLD0SS: + case X86::FsFLD0SD: return Expand2AddrUndef(MI, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); @@ -2771,7 +2772,9 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::RCPSSr: case X86::RCPSSr_Int: case X86::ROUNDSDr: + case X86::ROUNDSDr_Int: case X86::ROUNDSSr: + case X86::ROUNDSSr_Int: case X86::RSQRTSSr: case X86::RSQRTSSr_Int: case X86::SQRTSSr: @@ -2783,7 +2786,9 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::Int_VCVTSS2SDrr: case X86::VRCPSSr: case X86::VROUNDSDr: + case X86::VROUNDSDr_Int: case X86::VROUNDSSr: + case X86::VROUNDSSr_Int: case X86::VRSQRTSSr: case X86::VSQRTSSr: return true; @@ -2911,11 +2916,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = 16; break; case X86::FsFLD0SD: - case X86::VFsFLD0SD: Alignment = 8; break; case X86::FsFLD0SS: - case X86::VFsFLD0SS: Alignment = 4; break; default: @@ -2950,9 +2953,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, case X86::AVX_SETALLONES: case X86::AVX2_SETALLONES: case X86::FsFLD0SD: - case X86::FsFLD0SS: - case X86::VFsFLD0SD: - case X86::VFsFLD0SS: { + case X86::FsFLD0SS: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. 
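Context for the FsFLD0SS/FsFLD0SD hunks above and below: with the separate VFsFLD0SS/VFsFLD0SD definitions removed (see the X86InstrSSE.td hunk later in this patch), a single pair of pseudos covers both SSE and AVX. The zero idiom is chosen late in expandPostRAPseudo, and when the value is instead folded as a load, foldMemoryOperandImpl materializes a 0.0 constant-pool entry with the natural scalar alignment. A rough C++ sketch of that split, using stand-in names only:

// Stand-in opcode names; the real code uses X86::XORPSrr / X86::VXORPSrr and
// the MachineConstantPool, as in the surrounding hunks.
enum ZeroIdiom { XorPS, VXorPS };

static ZeroIdiom scalarZeroExpansion(bool HasAVX) {
  // Register form: both pseudos expand to a self-xor of the destination, so
  // only the encoding (legacy vs. VEX) has to be picked at expansion time.
  return HasAVX ? VXorPS : XorPS;
}

static unsigned scalarZeroFoldAlignment(bool IsDouble) {
  // Folded form: load 0.0f / 0.0 from the constant pool instead.
  return IsDouble ? 8u : 4u; // FsFLD0SD vs. FsFLD0SS
}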
@@ -2978,9 +2979,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineConstantPool &MCP = *MF.getConstantPool(); Type *Ty; unsigned Opc = LoadMI->getOpcode(); - if (Opc == X86::FsFLD0SS || Opc == X86::VFsFLD0SS) + if (Opc == X86::FsFLD0SS) Ty = Type::getFloatTy(MF.getFunction()->getContext()); - else if (Opc == X86::FsFLD0SD || Opc == X86::VFsFLD0SD) + else if (Opc == X86::FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY) Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8); @@ -3569,7 +3570,13 @@ static const unsigned ReplaceableInstrsAVX2[][3] = { { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm }, { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr }, { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm }, - { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr } + { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr }, + { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, + { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr }, + { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm }, + { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr }, + { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm }, + { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr } }; // FIXME: Some shuffle and unpack instructions have equivalents in different diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 35631d5..0bc3afa 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -473,6 +473,7 @@ def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; +def HasXMM : Predicate<"Subtarget->hasXMM()">; def HasXMMInt : Predicate<"Subtarget->hasXMMInt()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; @@ -480,6 +481,7 @@ def HasAES : Predicate<"Subtarget->hasAES()">; def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">; def HasFMA3 : Predicate<"Subtarget->hasFMA3()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; +def HasXOP : Predicate<"Subtarget->hasXOP()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; def HasF16C : Predicate<"Subtarget->hasF16C()">; @@ -1502,6 +1504,9 @@ include "X86InstrFragmentsSIMD.td" // FMA - Fused Multiply-Add support (requires FMA) include "X86InstrFMA.td" +// XOP +include "X86InstrXOP.td" + // SSE, MMX and 3DNow! vector support. include "X86InstrSSE.td" include "X86InstrMMX.td" diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 7cadac1..345f606 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -240,21 +240,13 @@ let Predicates = [HasAVX] in { } // Alias instructions that map fld0 to pxor for sse. -// FIXME: Set encoding to pseudo! 
-let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1, - canFoldAsLoad = 1 in { - def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "", - [(set FR32:$dst, fp32imm0)]>, - Requires<[HasSSE1]>, TB, OpSize; - def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, - Requires<[HasSSE2]>, TB, OpSize; - def VFsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "", - [(set FR32:$dst, fp32imm0)]>, - Requires<[HasAVX]>, TB, OpSize, VEX_4V; - def VFsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, - Requires<[HasAVX]>, TB, OpSize, VEX_4V; +// This is expanded by ExpandPostRAPseudos. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1 in { + def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", + [(set FR32:$dst, fp32imm0)]>, Requires<[HasXMM]>; + def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", + [(set FR64:$dst, fpimm0)]>, Requires<[HasXMMInt]>; } //===----------------------------------------------------------------------===// @@ -569,6 +561,16 @@ let Predicates = [HasAVX] in { (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>; def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (v4f32 (V_SET0)), + (EXTRACT_SUBREG (v8f32 VR256:$src), sub_ss)), sub_xmm)>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v8i32 VR256:$src), sub_ss)), sub_xmm)>; } let AddedComplexity = 20 in { @@ -596,6 +598,9 @@ let Predicates = [HasAVX] in { // Represent the same patterns above but in the form they appear for // 256-bit types + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; @@ -613,6 +618,15 @@ let Predicates = [HasAVX] in { (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (i32 0)))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (v2f64 (V_SET0)), + (EXTRACT_SUBREG (v4f64 VR256:$src), sub_sd)), sub_xmm)>; // Extract and store. 
def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), @@ -634,6 +648,16 @@ let Predicates = [HasAVX] in { (VMOVSSrr (v4f32 VR128:$src1), (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; + // 256-bit variants + def : Pat<(v8i32 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_ss), + (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_ss)), sub_xmm)>; + def : Pat<(v8f32 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_ss), + (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_ss)), sub_xmm)>; + // Shuffle with VMOVSD def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), (VMOVSDrr VR128:$src1, FR64:$src2)>; @@ -650,6 +674,17 @@ let Predicates = [HasAVX] in { (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>; + // 256-bit variants + def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_sd), + (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_sd)), sub_xmm)>; + def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_sd), + (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_sd)), sub_xmm)>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem // is during lowering, where it's not possible to recognize the fold cause // it has two uses through a bitcast. One use disappears at isel time and the @@ -657,6 +692,9 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2), + sub_sd))>; def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>; @@ -761,6 +799,22 @@ let isCodeGenOnly = 1 in { "movupd\t{$src, $dst|$dst, $src}", []>, VEX; } +let Predicates = [HasAVX] in { +def : Pat<(v8i32 (X86vzmovl + (insert_subvector undef, (v4i32 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v4i64 (X86vzmovl + (insert_subvector undef, (v2i64 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v8f32 (X86vzmovl + (insert_subvector undef, (v4f32 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v4f64 (X86vzmovl + (insert_subvector undef, (v2f64 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +} + + def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>; def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), (VMOVUPSYmr addr:$dst, VR256:$src)>; @@ -1156,14 +1210,17 @@ let Predicates = [HasAVX] in { (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), (VMOVHPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (VMOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), (VMOVHPSrm VR128:$src1, addr:$src2)>; - // FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold cause // it has two uses 
through a bitcast. One use disappears at isel time and the // fold opportunity reappears. - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (VMOVHPDrm VR128:$src1, addr:$src2)>; @@ -1174,10 +1231,10 @@ let Predicates = [HasAVX] in { // Store patterns def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst), + (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst), (VMOVHPSmr addr:$dst, VR128:$src)>; def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))), addr:$dst), + (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst), (VMOVHPDmr addr:$dst, VR128:$src)>; } @@ -1189,21 +1246,24 @@ let Predicates = [HasSSE1] in { (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), (MOVHPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (MOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), (MOVHPSrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst), + (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst), (MOVHPSmr addr:$dst, VR128:$src)>; } let Predicates = [HasSSE2] in { - // FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold cause // it has two uses through a bitcast. One use disappears at isel time and the // fold opportunity reappears. - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; @@ -1214,7 +1274,7 @@ let Predicates = [HasSSE2] in { // Store patterns def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))),addr:$dst), + (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))),addr:$dst), (MOVHPDmr addr:$dst, VR128:$src)>; } @@ -1943,7 +2003,7 @@ def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // whenever possible to avoid declaring two versions of each one. 
def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), (VCVTDQ2PSYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvtdq2_ps_256 (memopv8i32 addr:$src)), +def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))), (VCVTDQ2PSYrm addr:$src)>; def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src), @@ -2430,27 +2490,27 @@ let AddedComplexity = 10 in { } // AddedComplexity let Predicates = [HasSSE1] in { - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), + def : Pat<(v4f32 (X86Unpckl VR128:$src1, (memopv4f32 addr:$src2))), (UNPCKLPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), + def : Pat<(v4f32 (X86Unpckl VR128:$src1, VR128:$src2)), (UNPCKLPSrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))), + def : Pat<(v4f32 (X86Unpckh VR128:$src1, (memopv4f32 addr:$src2))), (UNPCKHPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), + def : Pat<(v4f32 (X86Unpckh VR128:$src1, VR128:$src2)), (UNPCKHPSrr VR128:$src1, VR128:$src2)>; } let Predicates = [HasSSE2] in { - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (memopv2f64 addr:$src2))), (UNPCKLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), + def : Pat<(v2f64 (X86Unpckl VR128:$src1, VR128:$src2)), (UNPCKLPDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), + def : Pat<(v2f64 (X86Unpckh VR128:$src1, (memopv2f64 addr:$src2))), (UNPCKHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), + def : Pat<(v2f64 (X86Unpckh VR128:$src1, VR128:$src2)), (UNPCKHPDrr VR128:$src1, VR128:$src2)>; - // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the + // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the // problem is during lowering, where it's not possible to recognize the load // fold cause it has two uses through a bitcast. One use disappears at isel // time and the fold opportunity reappears. 
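The SSE1/SSE2 pattern updates above and the AVX block below all follow from collapsing the per-width unpack nodes into the single X86Unpckl/X86Unpckh pair: the interleave an unpack performs follows the same rule at every element width, so the node plus the pattern's value type (v4f32, v2f64, v16i8, ...) is enough to pick the concrete instruction. A small illustrative helper, not part of the patch, that builds the shuffle mask such a node stands for:

#include <vector>

// "Unpack low" mask for any element/lane count; "unpack high" is the same
// with an extra offset of NumLaneElts/2 into each lane.
static std::vector<int> unpackLoMask(unsigned NumElts, unsigned NumLanes) {
  std::vector<int> Mask;
  unsigned NumLaneElts = NumElts / NumLanes;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
    for (unsigned i = 0; i != NumLaneElts / 2; ++i) {
      Mask.push_back(Lane * NumLaneElts + i);           // element from src1
      Mask.push_back(Lane * NumLaneElts + i + NumElts); // element from src2
    }
  return Mask;
}

unpackLoMask(4, 1) yields <0,4,1,5> (unpcklps/punpckldq), unpackLoMask(16, 1) yields <0,16,1,17, ...> (punpcklbw), and unpackLoMask(8, 2) yields the lane-split <0,8,1,9, 4,12,5,13> used by the 256-bit forms.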
@@ -2463,59 +2523,43 @@ let Predicates = [HasSSE2] in { } let Predicates = [HasAVX] in { - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), + def : Pat<(v4f32 (X86Unpckl VR128:$src1, (memopv4f32 addr:$src2))), (VUNPCKLPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), + def : Pat<(v4f32 (X86Unpckl VR128:$src1, VR128:$src2)), (VUNPCKLPSrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))), + def : Pat<(v4f32 (X86Unpckh VR128:$src1, (memopv4f32 addr:$src2))), (VUNPCKHPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), + def : Pat<(v4f32 (X86Unpckh VR128:$src1, VR128:$src2)), (VUNPCKHPSrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8f32 (X86Unpcklps VR256:$src1, (memopv8f32 addr:$src2))), + def : Pat<(v8f32 (X86Unpckl VR256:$src1, (memopv8f32 addr:$src2))), (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpcklps VR256:$src1, VR256:$src2)), - (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpcklps VR256:$src1, VR256:$src2)), + def : Pat<(v8f32 (X86Unpckl VR256:$src1, VR256:$src2)), (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpcklps VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), - (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpckhps VR256:$src1, (memopv8f32 addr:$src2))), - (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpckhps VR256:$src1, VR256:$src2)), - (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpckhps VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + def : Pat<(v8f32 (X86Unpckh VR256:$src1, (memopv8f32 addr:$src2))), (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8i32 (X86Unpckhps VR256:$src1, VR256:$src2)), + def : Pat<(v8f32 (X86Unpckh VR256:$src1, VR256:$src2)), (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (memopv2f64 addr:$src2))), (VUNPCKLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), + def : Pat<(v2f64 (X86Unpckl VR128:$src1, VR128:$src2)), (VUNPCKLPDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), + def : Pat<(v2f64 (X86Unpckh VR128:$src1, (memopv2f64 addr:$src2))), (VUNPCKHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), + def : Pat<(v2f64 (X86Unpckh VR128:$src1, VR128:$src2)), (VUNPCKHPDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4f64 (X86Unpcklpd VR256:$src1, (memopv4f64 addr:$src2))), + def : Pat<(v4f64 (X86Unpckl VR256:$src1, (memopv4f64 addr:$src2))), (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4f64 (X86Unpcklpd VR256:$src1, VR256:$src2)), + def : Pat<(v4f64 (X86Unpckl VR256:$src1, VR256:$src2)), (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpcklpd VR256:$src1, (memopv4i64 addr:$src2))), - (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (X86Unpcklpd VR256:$src1, VR256:$src2)), - (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4f64 (X86Unpckhpd VR256:$src1, (memopv4f64 addr:$src2))), - (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4f64 (X86Unpckhpd VR256:$src1, VR256:$src2)), - (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpckhpd VR256:$src1, (memopv4i64 addr:$src2))), + def : Pat<(v4f64 (X86Unpckh VR256:$src1, (memopv4f64 addr:$src2))), (VUNPCKHPDYrm VR256:$src1, 
addr:$src2)>; - def : Pat<(v4i64 (X86Unpckhpd VR256:$src1, VR256:$src2)), + def : Pat<(v4f64 (X86Unpckh VR256:$src1, VR256:$src2)), (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; - // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the + // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the // problem is during lowering, where it's not possible to recognize the load // fold cause it has two uses through a bitcast. One use disappears at isel // time and the fold opportunity reappears. @@ -2869,7 +2913,7 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins ssmem:$src1, VR128:$src2), + (ins VR128:$src1, ssmem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; } @@ -3198,13 +3242,13 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), //===----------------------------------------------------------------------===// // Prefetch intrinsic. -def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src), +def PREFETCHT0 : VoPSI<0x18, MRM1m, (outs), (ins i8mem:$src), "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>; -def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src), +def PREFETCHT1 : VoPSI<0x18, MRM2m, (outs), (ins i8mem:$src), "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>; -def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src), +def PREFETCHT2 : VoPSI<0x18, MRM3m, (outs), (ins i8mem:$src), "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>; -def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), +def PREFETCHNTA : VoPSI<0x18, MRM0m, (outs), (ins i8mem:$src), "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>; // Flush cache @@ -3652,6 +3696,8 @@ defm VPOR : PDI_binop_rm<0xEB, "vpor" , or, v2i64, VR128, memopv2i64, i128mem, 1, 0>, VEX_4V; defm VPXOR : PDI_binop_rm<0xEF, "vpxor", xor, v2i64, VR128, memopv2i64, i128mem, 1, 0>, VEX_4V; +defm VPANDN : PDI_binop_rm<0xDF, "vpandn", X86andnp, v2i64, VR128, memopv2i64, + i128mem, 0, 0>, VEX_4V; let ExeDomain = SSEPackedInt in { let neverHasSideEffects = 1 in { @@ -3666,17 +3712,6 @@ let ExeDomain = SSEPackedInt in { VEX_4V; // PSRADQri doesn't exist in SSE[1-3]. } - def VPANDNrr : PDI<0xDF, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, - (v2i64 (X86andnp VR128:$src1, VR128:$src2)))]>,VEX_4V; - - def VPANDNrm : PDI<0xDF, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (X86andnp VR128:$src1, - (memopv2i64 addr:$src2)))]>, VEX_4V; } } @@ -3714,6 +3749,8 @@ defm VPORY : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64, i256mem, 1, 0>, VEX_4V; defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64, i256mem, 1, 0>, VEX_4V; +defm VPANDNY : PDI_binop_rm<0xDF, "vpandn", X86andnp, v4i64, VR256, memopv4i64, + i256mem, 0, 0>, VEX_4V; let ExeDomain = SSEPackedInt in { let neverHasSideEffects = 1 in { @@ -3728,17 +3765,6 @@ let ExeDomain = SSEPackedInt in { VEX_4V; // PSRADQYri doesn't exist in SSE[1-3]. 
} - def VPANDNYrr : PDI<0xDF, MRMSrcReg, - (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), - "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR256:$dst, - (v4i64 (X86andnp VR256:$src1, VR256:$src2)))]>,VEX_4V; - - def VPANDNYrm : PDI<0xDF, MRMSrcMem, - (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), - "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR256:$dst, (X86andnp VR256:$src1, - (memopv4i64 addr:$src2)))]>, VEX_4V; } } @@ -3776,6 +3802,8 @@ defm POR : PDI_binop_rm<0xEB, "por" , or, v2i64, VR128, memopv2i64, i128mem, 1>; defm PXOR : PDI_binop_rm<0xEF, "pxor", xor, v2i64, VR128, memopv2i64, i128mem, 1>; +defm PANDN : PDI_binop_rm<0xDF, "pandn", X86andnp, v2i64, VR128, memopv2i64, + i128mem, 0>; let ExeDomain = SSEPackedInt in { let neverHasSideEffects = 1 in { @@ -3787,14 +3815,6 @@ let ExeDomain = SSEPackedInt in { (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), "psrldq\t{$src2, $dst|$dst, $src2}", []>; // PSRADQri doesn't exist in SSE[1-3]. - def PANDNrr : PDI<0xDF, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "pandn\t{$src2, $dst|$dst, $src2}", []>; - - let mayLoad = 1 in - def PANDNrm : PDI<0xDF, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "pandn\t{$src2, $dst|$dst, $src2}", []>; } } } // Constraints = "$src1 = $dst" @@ -4198,66 +4218,88 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, } let Predicates = [HasAVX] in { - defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Punpcklbw, + defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, bc_v16i8, 0>, VEX_4V; - defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Punpcklwd, + defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, bc_v8i16, 0>, VEX_4V; - defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Punpckldq, + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, bc_v4i32, 0>, VEX_4V; - defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Punpcklqdq, + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, bc_v2i64, 0>, VEX_4V; - defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Punpckhbw, + defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, bc_v16i8, 0>, VEX_4V; - defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Punpckhwd, + defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, bc_v8i16, 0>, VEX_4V; - defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Punpckhdq, + defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, bc_v4i32, 0>, VEX_4V; - defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Punpckhqdq, + defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, bc_v2i64, 0>, VEX_4V; } let Predicates = [HasAVX2] in { - defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Punpcklbw, + defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, bc_v32i8>, VEX_4V; - defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Punpcklwd, + defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, bc_v16i16>, VEX_4V; - defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Punpckldq, + defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, bc_v8i32>, VEX_4V; - defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Punpcklqdq, + defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, bc_v4i64>, VEX_4V; - defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, 
X86Punpckhbw, + defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, bc_v32i8>, VEX_4V; - defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Punpckhwd, + defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh, bc_v16i16>, VEX_4V; - defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Punpckhdq, + defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, bc_v8i32>, VEX_4V; - defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Punpckhqdq, + defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, bc_v4i64>, VEX_4V; } let Constraints = "$src1 = $dst" in { - defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Punpcklbw, + defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, bc_v16i8>; - defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Punpcklwd, + defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, bc_v8i16>; - defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Punpckldq, + defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, bc_v4i32>; - defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Punpcklqdq, + defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, bc_v2i64>; - defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Punpckhbw, + defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, bc_v16i8>; - defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Punpckhwd, + defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, bc_v8i16>; - defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Punpckhdq, + defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, bc_v4i32>; - defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Punpckhqdq, + defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, bc_v2i64>; } } // ExeDomain = SSEPackedInt +// Patterns for using AVX1 instructions with integer vectors +// Here to give AVX2 priority +let Predicates = [HasAVX] in { + def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; +} + // Splat v2f64 / v2i64 let AddedComplexity = 10 in { def : Pat<(splat_lo (v2i64 VR128:$src), (undef)), @@ -4784,7 +4826,7 @@ def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // AVX 256-bit register conversion intrinsics def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src), (VCVTDQ2PDYrr VR128:$src)>; -def : Pat<(int_x86_avx_cvtdq2_pd_256 (memopv4i32 addr:$src)), +def : Pat<(int_x86_avx_cvtdq2_pd_256 (bitconvert (memopv2i64 addr:$src))), (VCVTDQ2PDYrm addr:$src)>; def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src), @@ -4794,7 +4836,7 @@ def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)), def : Pat<(v4f64 
(sint_to_fp (v4i32 VR128:$src))), (VCVTDQ2PDYrr VR128:$src)>; -def : Pat<(v4f64 (sint_to_fp (memopv4i32 addr:$src))), +def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), (VCVTDQ2PDYrm addr:$src)>; //===---------------------------------------------------------------------===// @@ -5085,7 +5127,7 @@ let Constraints = "$src1 = $dst" in { /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, - PatFrag mem_frag128, Intrinsic IntId128> { + Intrinsic IntId128> { def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -5097,12 +5139,12 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId128 - (bitconvert (mem_frag128 addr:$src))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src))))]>, OpSize; } /// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, - PatFrag mem_frag256, Intrinsic IntId256> { + Intrinsic IntId256> { def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -5114,32 +5156,32 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (IntId256 - (bitconvert (mem_frag256 addr:$src))))]>, OpSize; + (bitconvert (memopv4i64 addr:$src))))]>, OpSize; } let Predicates = [HasAVX] in { - defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8, + defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128>, VEX; - defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", memopv8i16, + defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128>, VEX; - defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", memopv4i32, + defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128>, VEX; } let Predicates = [HasAVX2] in { - defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb", memopv32i8, + defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb", int_x86_avx2_pabs_b>, VEX; - defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw", memopv16i16, + defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw", int_x86_avx2_pabs_w>, VEX; - defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd", memopv8i32, + defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd", int_x86_avx2_pabs_d>, VEX; } -defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", memopv16i8, +defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128>; -defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", memopv8i16, +defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128>; -defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", memopv4i32, +defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128>; //===---------------------------------------------------------------------===// @@ -5148,8 +5190,7 @@ defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", memopv4i32, /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, - PatFrag mem_frag128, Intrinsic IntId128, - bit Is2Addr = 1> { + Intrinsic IntId128, bit Is2Addr = 1> { let isCommutable = 1 in def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -5165,11 +5206,11 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (mem_frag128 addr:$src2))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; } multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, - PatFrag mem_frag256, Intrinsic IntId256> { + Intrinsic IntId256> { let isCommutable = 1 in def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), @@ -5181,94 +5222,94 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntId256 VR256:$src1, - (bitconvert (mem_frag256 addr:$src2))))]>, OpSize; + (bitconvert (memopv4i64 addr:$src2))))]>, OpSize; } let ImmT = NoImm, Predicates = [HasAVX] in { let isCommutable = 0 in { - defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16, + defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", int_x86_ssse3_phadd_w_128, 0>, VEX_4V; - defm VPHADDD : SS3I_binop_rm_int<0x02, "vphaddd", memopv4i32, + defm VPHADDD : SS3I_binop_rm_int<0x02, "vphaddd", int_x86_ssse3_phadd_d_128, 0>, VEX_4V; - defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", memopv8i16, + defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", int_x86_ssse3_phadd_sw_128, 0>, VEX_4V; - defm VPHSUBW : SS3I_binop_rm_int<0x05, "vphsubw", memopv8i16, + defm VPHSUBW : SS3I_binop_rm_int<0x05, "vphsubw", int_x86_ssse3_phsub_w_128, 0>, VEX_4V; - defm VPHSUBD : SS3I_binop_rm_int<0x06, "vphsubd", memopv4i32, + defm VPHSUBD : SS3I_binop_rm_int<0x06, "vphsubd", int_x86_ssse3_phsub_d_128, 0>, VEX_4V; - defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", memopv8i16, + defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", int_x86_ssse3_phsub_sw_128, 0>, VEX_4V; - defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", memopv16i8, + defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", int_x86_ssse3_pmadd_ub_sw_128, 0>, VEX_4V; - defm VPSHUFB : SS3I_binop_rm_int<0x00, "vpshufb", memopv16i8, + defm VPSHUFB : SS3I_binop_rm_int<0x00, "vpshufb", int_x86_ssse3_pshuf_b_128, 0>, VEX_4V; - defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", memopv16i8, + defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", int_x86_ssse3_psign_b_128, 0>, VEX_4V; - defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", memopv8i16, + defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", int_x86_ssse3_psign_w_128, 0>, VEX_4V; - defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", memopv4i32, + defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", int_x86_ssse3_psign_d_128, 0>, VEX_4V; } -defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", memopv8i16, +defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", int_x86_ssse3_pmul_hr_sw_128, 0>, VEX_4V; } let ImmT = NoImm, Predicates = [HasAVX2] in { let isCommutable = 0 in { - defm VPHADDW : SS3I_binop_rm_int_y<0x01, "vphaddw", memopv16i16, + defm VPHADDW : SS3I_binop_rm_int_y<0x01, "vphaddw", int_x86_avx2_phadd_w>, VEX_4V; - defm VPHADDD : SS3I_binop_rm_int_y<0x02, "vphaddd", memopv8i32, + defm VPHADDD : SS3I_binop_rm_int_y<0x02, "vphaddd", int_x86_avx2_phadd_d>, VEX_4V; - defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", memopv16i16, + defm VPHADDSW : 
SS3I_binop_rm_int_y<0x03, "vphaddsw", int_x86_avx2_phadd_sw>, VEX_4V; - defm VPHSUBW : SS3I_binop_rm_int_y<0x05, "vphsubw", memopv16i16, + defm VPHSUBW : SS3I_binop_rm_int_y<0x05, "vphsubw", int_x86_avx2_phsub_w>, VEX_4V; - defm VPHSUBD : SS3I_binop_rm_int_y<0x06, "vphsubd", memopv8i32, + defm VPHSUBD : SS3I_binop_rm_int_y<0x06, "vphsubd", int_x86_avx2_phsub_d>, VEX_4V; - defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", memopv16i16, + defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", int_x86_avx2_phsub_sw>, VEX_4V; - defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw", memopv32i8, + defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw", int_x86_avx2_pmadd_ub_sw>, VEX_4V; - defm VPSHUFB : SS3I_binop_rm_int_y<0x00, "vpshufb", memopv32i8, + defm VPSHUFB : SS3I_binop_rm_int_y<0x00, "vpshufb", int_x86_avx2_pshuf_b>, VEX_4V; - defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", memopv32i8, + defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b>, VEX_4V; - defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", memopv16i16, + defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w>, VEX_4V; - defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", memopv8i32, + defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d>, VEX_4V; } -defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", memopv16i16, +defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", int_x86_avx2_pmul_hr_sw>, VEX_4V; } // None of these have i8 immediate fields. let ImmT = NoImm, Constraints = "$src1 = $dst" in { let isCommutable = 0 in { - defm PHADDW : SS3I_binop_rm_int<0x01, "phaddw", memopv8i16, + defm PHADDW : SS3I_binop_rm_int<0x01, "phaddw", int_x86_ssse3_phadd_w_128>; - defm PHADDD : SS3I_binop_rm_int<0x02, "phaddd", memopv4i32, + defm PHADDD : SS3I_binop_rm_int<0x02, "phaddd", int_x86_ssse3_phadd_d_128>; - defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", memopv8i16, + defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", int_x86_ssse3_phadd_sw_128>; - defm PHSUBW : SS3I_binop_rm_int<0x05, "phsubw", memopv8i16, + defm PHSUBW : SS3I_binop_rm_int<0x05, "phsubw", int_x86_ssse3_phsub_w_128>; - defm PHSUBD : SS3I_binop_rm_int<0x06, "phsubd", memopv4i32, + defm PHSUBD : SS3I_binop_rm_int<0x06, "phsubd", int_x86_ssse3_phsub_d_128>; - defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", memopv8i16, + defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", int_x86_ssse3_phsub_sw_128>; - defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", memopv16i8, + defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", int_x86_ssse3_pmadd_ub_sw_128>; - defm PSHUFB : SS3I_binop_rm_int<0x00, "pshufb", memopv16i8, + defm PSHUFB : SS3I_binop_rm_int<0x00, "pshufb", int_x86_ssse3_pshuf_b_128>; - defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", memopv16i8, + defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128>; - defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", memopv8i16, + defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128>; - defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", memopv4i32, + defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128>; } -defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", memopv8i16, +defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", int_x86_ssse3_pmul_hr_sw_128>; } @@ -6017,8 +6058,18 @@ multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, Intrinsic F32Int, Intrinsic F64Int, bit Is2Addr = 1> { let ExeDomain = GenericDomain in { - // Intrinsic operation, reg. + // Operation, reg. 
def SSr : SS4AIi8<opcss, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, OpSize; + + // Intrinsic operation, reg. + def SSr_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -6040,8 +6091,18 @@ let ExeDomain = GenericDomain in { (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, OpSize; - // Intrinsic operation, reg. + // Operation, reg. def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, OpSize; + + // Intrinsic operation, reg. + def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -6079,6 +6140,27 @@ let Predicates = [HasAVX] in { defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", int_x86_sse41_round_ss, int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; + + def : Pat<(ffloor FR32:$src), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + def : Pat<(f64 (ffloor FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + def : Pat<(f64 (fceil FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + def : Pat<(f32 (frint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f64 (frint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, @@ -6088,6 +6170,27 @@ let Constraints = "$src1 = $dst" in defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", int_x86_sse41_round_ss, int_x86_sse41_round_sd>; +def : Pat<(ffloor FR32:$src), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; +def : Pat<(f64 (ffloor FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; +def : Pat<(f32 (fnearbyint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; +def : Pat<(f64 (fnearbyint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; +def : Pat<(f32 (fceil FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; +def : Pat<(f64 (fceil FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; +def : Pat<(f32 (frint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; +def : Pat<(f64 (frint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; +def : Pat<(f32 (ftrunc FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; +def : Pat<(f64 (ftrunc FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + //===----------------------------------------------------------------------===// // SSE4.1 - Packed Bit Test 
//===----------------------------------------------------------------------===// @@ -6195,7 +6298,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId128 - (bitconvert (memopv8i16 addr:$src))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src))))]>, OpSize; } let Predicates = [HasAVX] in @@ -6221,7 +6324,7 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; } /// SS41I_binop_rm_int - Simple SSE 4.1 binary operator @@ -6237,7 +6340,7 @@ multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntId256 VR256:$src1, - (bitconvert (memopv32i8 addr:$src2))))]>, OpSize; + (bitconvert (memopv4i64 addr:$src2))))]>, OpSize; } let Predicates = [HasAVX] in { @@ -6400,38 +6503,38 @@ let Predicates = [HasAVX] in { let isCommutable = 0 in { let ExeDomain = SSEPackedSingle in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv4f32, i128mem, 0>, VEX_4V; defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; + int_x86_avx_blend_ps_256, VR256, memopv8f32, i256mem, 0>, VEX_4V; } let ExeDomain = SSEPackedDouble in { defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv2f64, i128mem, 0>, VEX_4V; defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; + int_x86_avx_blend_pd_256, VR256, memopv4f64, i256mem, 0>, VEX_4V; } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv2i64, i128mem, 0>, VEX_4V; defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv2i64, i128mem, 0>, VEX_4V; } let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv4f32, i128mem, 0>, VEX_4V; let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv2f64, i128mem, 0>, VEX_4V; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, - VR256, memopv32i8, i256mem, 0>, VEX_4V; + VR256, memopv8f32, i256mem, 0>, VEX_4V; } let Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, - VR256, memopv32i8, i256mem, 0>, VEX_4V; + VR256, memopv4i64, i256mem, 0>, VEX_4V; defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, - VR256, memopv32i8, i256mem, 0>, VEX_4V; + VR256, memopv4i64, i256mem, 0>, VEX_4V; } } @@ -6439,35 +6542,35 @@ let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, - VR128, memopv16i8, i128mem>; + VR128, memopv4f32, i128mem>; let ExeDomain = SSEPackedDouble in defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", 
int_x86_sse41_blendpd, - VR128, memopv16i8, i128mem>; + VR128, memopv2f64, i128mem>; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, - VR128, memopv16i8, i128mem>; + VR128, memopv2i64, i128mem>; defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv16i8, i128mem>; + VR128, memopv2i64, i128mem>; } let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, - VR128, memopv16i8, i128mem>; + VR128, memopv4f32, i128mem>; let ExeDomain = SSEPackedDouble in defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, - VR128, memopv16i8, i128mem>; + VR128, memopv2f64, i128mem>; } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, PatFrag mem_frag, Intrinsic IntId> { - def rr : I<opc, MRMSrcReg, (outs RC:$dst), + def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; - def rm : I<opc, MRMSrcMem, (outs RC:$dst), + def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), @@ -6480,23 +6583,23 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in { let ExeDomain = SSEPackedDouble in { defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem, - memopv16i8, int_x86_sse41_blendvpd>; + memopv2f64, int_x86_sse41_blendvpd>; defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem, - memopv32i8, int_x86_avx_blendv_pd_256>; + memopv4f64, int_x86_avx_blendv_pd_256>; } // ExeDomain = SSEPackedDouble let ExeDomain = SSEPackedSingle in { defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem, - memopv16i8, int_x86_sse41_blendvps>; + memopv4f32, int_x86_sse41_blendvps>; defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem, - memopv32i8, int_x86_avx_blendv_ps_256>; + memopv8f32, int_x86_avx_blendv_ps_256>; } // ExeDomain = SSEPackedSingle defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, - memopv16i8, int_x86_sse41_pblendvb>; + memopv2i64, int_x86_sse41_pblendvb>; } let Predicates = [HasAVX2] in { defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, - memopv32i8, int_x86_avx2_pblendvb>; + memopv4i64, int_x86_avx2_pblendvb>; } let Predicates = [HasAVX] in { @@ -6537,7 +6640,8 @@ let Predicates = [HasAVX2] in { /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { - multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> { + multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + Intrinsic IntId> { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, @@ -6551,15 +6655,18 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, - (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize; + (bitconvert (mem_frag addr:$src2)), XMM0))]>, OpSize; } } let ExeDomain = SSEPackedDouble in -defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>; +defm BLENDVPD : 
SS41I_ternary_int<0x15, "blendvpd", memopv2f64, + int_x86_sse41_blendvpd>; let ExeDomain = SSEPackedSingle in -defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>; -defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, + int_x86_sse41_blendvps>; +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, + int_x86_sse41_pblendvb>; let Predicates = [HasSSE41] in { def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), @@ -6614,8 +6721,7 @@ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; + (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize; } /// SS42I_binop_rm_int - Simple SSE 4.2 binary operator @@ -6630,8 +6736,7 @@ multiclass SS42I_binop_rm_int_y<bits<8> opc, string OpcodeStr, (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (IntId256 VR256:$src1, - (bitconvert (memopv32i8 addr:$src2))))]>, OpSize; + (IntId256 VR256:$src1, (memopv4i64 addr:$src2)))]>, OpSize; } let Predicates = [HasAVX] in { @@ -6913,7 +7018,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; } // Perform One Round of an AES Encryption/Decryption Flow @@ -7144,7 +7249,7 @@ def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), //===----------------------------------------------------------------------===// // VINSERTF128 - Insert packed floating-point values // -let neverHasSideEffects = 1 in { +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, i8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -7163,35 +7268,10 @@ def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3), def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3), (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; -def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; - 
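For context, the vinsertf128_insert patterns being reshuffled here select the 128-bit lane-insert instructions. A minimal C++ sketch of the operation these patterns cover, using the standard AVX intrinsics (illustrative only, not part of the patch; the function names are invented for the example, and it assumes a compiler invoked with -mavx or -mavx2):

#include <immintrin.h>

// Insert a 128-bit lane into the upper half of a 256-bit register.
// With AVX this is a single vinsertf128 with immediate 1.
__m256 insert_high_ps(__m256 acc, __m128 lane) {
  return _mm256_insertf128_ps(acc, lane, 1);
}

// For integer vectors the HasAVX2 patterns get first pick (vinserti128);
// without AVX2 the AVX1 patterns fall back to vinsertf128 on the same data.
__m256i insert_high_epi32(__m256i acc, __m128i lane) {
  return _mm256_insertf128_si256(acc, lane, 1);
}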
//===----------------------------------------------------------------------===// // VEXTRACTF128 - Extract packed floating-point values // -let neverHasSideEffects = 1 in { +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), (ins VR256:$src1, i8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -7210,31 +7290,6 @@ def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), (VEXTRACTF128rr VR256:$src1, imm:$src2)>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v4f32 (VEXTRACTF128rr - (v8f32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v2f64 (VEXTRACTF128rr - (v4f64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v4i32 (VEXTRACTF128rr - (v8i32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v2i64 (VEXTRACTF128rr - (v4i64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v8i16 (VEXTRACTF128rr - (v16i16 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v16i8 (VEXTRACTF128rr - (v32i8 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; - //===----------------------------------------------------------------------===// // VMASKMOV - Conditional SIMD Packed Loads and Stores // @@ -7288,7 +7343,8 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop_i:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (IntVar RC:$src1, (i_frag addr:$src2)))]>, VEX_4V; + [(set RC:$dst, (IntVar RC:$src1, + (bitconvert (i_frag addr:$src2))))]>, VEX_4V; def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), @@ -7302,11 +7358,11 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, let ExeDomain = SSEPackedSingle in { defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, - memopv4f32, memopv4i32, + memopv4f32, memopv2i64, int_x86_avx_vpermilvar_ps, int_x86_avx_vpermil_ps>; defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, - memopv8f32, memopv8i32, + memopv8f32, memopv4i64, int_x86_avx_vpermilvar_ps_256, int_x86_avx_vpermil_ps_256>; } @@ -7321,19 +7377,28 @@ let ExeDomain = SSEPackedDouble in { int_x86_avx_vpermil_pd_256>; } -def : Pat<(v8f32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))), +def : Pat<(v8f32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4f64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))), +def : Pat<(v4f64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPDYri VR256:$src1, imm:$imm)>; -def : Pat<(v8i32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))), +def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPDYri VR256:$src1, imm:$imm)>; +def : Pat<(v8f32 (X86VPermilp (memopv8f32 addr:$src1), (i8 imm:$imm))), + (VPERMILPSYmi addr:$src1, imm:$imm)>; +def : Pat<(v4f64 
(X86VPermilp (memopv4f64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDYmi addr:$src1, imm:$imm)>; +def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)), + (i8 imm:$imm))), + (VPERMILPSYmi addr:$src1, imm:$imm)>; +def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDYmi addr:$src1, imm:$imm)>; //===----------------------------------------------------------------------===// // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks // -let neverHasSideEffects = 1 in { +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -7359,22 +7424,9 @@ def : Pat<(int_x86_avx_vperm2f128_pd_256 VR256:$src1, (memopv4f64 addr:$src2), imm:$src3), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; def : Pat<(int_x86_avx_vperm2f128_si_256 - VR256:$src1, (memopv8i32 addr:$src2), imm:$src3), + VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), imm:$src3), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; -def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v8i32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v4i64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v4f64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; - //===----------------------------------------------------------------------===// // VZERO - Zero YMM registers // @@ -7451,9 +7503,9 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, let isCommutable = 0 in { defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, - VR128, memopv16i8, i128mem>; + VR128, memopv2i64, i128mem>; defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, - VR256, memopv32i8, i256mem>; + VR256, memopv4i64, i256mem>; } //===----------------------------------------------------------------------===// @@ -7541,11 +7593,12 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (Int VR256:$src1, (mem_frag addr:$src2)))]>, + [(set VR256:$dst, (Int VR256:$src1, + (bitconvert (mem_frag addr:$src2))))]>, VEX_4V; } -defm VPERMD : avx2_perm<0x36, "vpermd", memopv8i32, int_x86_avx2_permd>; +defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, int_x86_avx2_permd>; let ExeDomain = SSEPackedSingle in defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, int_x86_avx2_permps>; @@ -7571,7 +7624,7 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, int_x86_avx2_permpd>, VEX_W; //===----------------------------------------------------------------------===// -// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks +// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks // def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), @@ -7587,6 
+7640,64 @@ def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), imm:$src3))]>, VEX_4V; +let Predicates = [HasAVX2] in { +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; + +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + +// AVX1 patterns +def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; + +def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1, + (memopv8f32 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, + (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, + (memopv4i64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, + (memopv4f64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, + (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; + + //===----------------------------------------------------------------------===// // VINSERTI128 - Insert packed integer values // @@ -7603,6 +7714,51 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), (int_x86_avx2_vinserti128 VR256:$src1, (memopv2i64 addr:$src2), imm:$src3))]>, VEX_4V; +let Predicates = [HasAVX2] in { +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 
VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +} + +// AVX1 patterns +def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; + //===----------------------------------------------------------------------===// // VEXTRACTI128 - Extract packed integer values // @@ -7617,6 +7773,51 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), (ins i128mem:$dst, VR256:$src1, i8imm:$src2), "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX; +let Predicates = [HasAVX2] in { +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2i64 (VEXTRACTI128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4i32 (VEXTRACTI128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v8i16 (VEXTRACTI128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v16i8 (VEXTRACTI128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +} + +// AVX1 patterns +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4f32 (VEXTRACTF128rr + (v8f32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2f64 (VEXTRACTF128rr + (v4f64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2i64 (VEXTRACTF128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4i32 (VEXTRACTF128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v8i16 (VEXTRACTF128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v16i8 (VEXTRACTF128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; 
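The vextractf128_extract patterns above are the matching 128-bit lane extracts. A similar minimal sketch with the standard intrinsics (again illustrative only, not part of the patch; function names are assumptions):

#include <immintrin.h>

// Extract the upper 128-bit lane; selects vextractf128 with immediate 1 under AVX.
__m128 extract_high_ps(__m256 v) {
  return _mm256_extractf128_ps(v, 1);
}

// Integer vectors prefer the HasAVX2 patterns (vextracti128) when available;
// the AVX1 patterns reuse vextractf128 otherwise.
__m128i extract_high_epi32(__m256i v) {
  return _mm256_extractf128_si256(v, 1);
}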
+ //===----------------------------------------------------------------------===// // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores // diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td new file mode 100644 index 0000000..64cc44d --- /dev/null +++ b/lib/Target/X86/X86InstrXOP.td @@ -0,0 +1,243 @@ +//====- X86InstrXOP.td - Describe the X86 Instruction Set --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes XOP (eXtended OPerations) +// +//===----------------------------------------------------------------------===// + +multiclass xop2op<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, VEX; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", f128mem>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", f128mem>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", f128mem>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", f128mem>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", f128mem>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", f128mem>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", f128mem>; + defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", f128mem>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", f128mem>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", f128mem>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", f128mem>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", f128mem>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", f128mem>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", f128mem>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", f128mem>; + defm VFRCZSS : xop2op<0x82, "vfrczss", f32mem>; + defm VFRCZSD : xop2op<0x83, "vfrczsd", f64mem>; + defm VFRCZPS : xop2op<0x80, "vfrczps", f128mem>; + defm VFRCZPD : xop2op<0x81, "vfrczpd", f128mem>; +} + +multiclass xop2op256<bits<8> opc, string OpcodeStr> { + def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, VEX, VEX_L; + def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VFRCZPS : xop2op256<0x80, "vfrczps">; + defm VFRCZPD : xop2op256<0x81, "vfrczpd">; +} + +multiclass xop3op<bits<8> opc, string OpcodeStr> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4VOp3; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_W; + def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins f128mem:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4VOp3; +} + +let isAsmParserOnly = 1 in { + defm VPSHLW : xop3op<0x95, "vpshlw">; + defm VPSHLQ : xop3op<0x97, "vpshlq">; + defm VPSHLD : xop3op<0x96, "vpshld">; + defm VPSHLB : xop3op<0x94, "vpshlb">; + defm VPSHAW : xop3op<0x99, "vpshaw">; + defm VPSHAQ : xop3op<0x9B, "vpshaq">; + defm VPSHAD : xop3op<0x9A, 
"vpshad">; + defm VPSHAB : xop3op<0x98, "vpshab">; + defm VPROTW : xop3op<0x91, "vprotw">; + defm VPROTQ : xop3op<0x93, "vprotq">; + defm VPROTD : xop3op<0x92, "vprotd">; + defm VPROTB : xop3op<0x90, "vprotb">; +} + +multiclass xop3opimm<bits<8> opc, string OpcodeStr> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins f128mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VPROTW : xop3opimm<0xC1, "vprotw">; + defm VPROTQ : xop3opimm<0xC3, "vprotq">; + defm VPROTD : xop3opimm<0xC2, "vprotd">; + defm VPROTB : xop3opimm<0xC0, "vprotb">; +} + +// Instruction where second source can be memory, but third must be register +multiclass xop4opm2<bits<8> opc, string OpcodeStr> { + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; +} + +let isAsmParserOnly = 1 in { + defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd">; + defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd">; + defm VPMACSWW : xop4opm2<0x95, "vpmacsww">; + defm VPMACSWD : xop4opm2<0x96, "vpmacswd">; + defm VPMACSSWW : xop4opm2<0x85, "vpmacssww">; + defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd">; + defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql">; + defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh">; + defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd">; + defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql">; + defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh">; + defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd">; +} + +// Instruction where second source can be memory, third must be imm8 +multiclass xop4opimm<bits<8> opc, string OpcodeStr> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V; +} + +let isAsmParserOnly = 1 in { + defm VPCOMW : xop4opimm<0xCD, "vpcomw">; + defm VPCOMUW : xop4opimm<0xED, "vpcomuw">; + defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq">; + defm VPCOMUD : xop4opimm<0xEE, "vpcomud">; + defm VPCOMUB : xop4opimm<0xEC, "vpcomub">; + defm VPCOMQ : xop4opimm<0xCF, "vpcomq">; + defm VPCOMD : xop4opimm<0xCE, "vpcomd">; + defm VPCOMB : xop4opimm<0xCC, "vpcomb">; +} + +// Instruction where either second or third source can be memory +multiclass xop4op<bits<8> opc, string OpcodeStr> { + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM, XOP_W; + def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, 
f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; +} + +let isAsmParserOnly = 1 in { + defm VPPERM : xop4op<0xA3, "vpperm">; + defm VPCMOV : xop4op<0xA2, "vpcmov">; +} + +multiclass xop4op256<bits<8> opc, string OpcodeStr> { + def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; + def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM, XOP_W; + def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; +} + +let isAsmParserOnly = 1 in { + defm VPCMOV : xop4op256<0xA2, "vpcmov">; +} + +multiclass xop5op<bits<8> opc, string OpcodeStr> { + def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>; + def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>, XOP_W; + def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>; + def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>; + def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>, XOP_W; + def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>; +} + +let isAsmParserOnly = 1 in { + defm VPERMIL2PD : xop5op<0x49, "vpermil2pd">; + defm VPERMIL2PS : xop5op<0x48, "vpermil2ps">; +} diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index 3f88fa6..2145a33 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -424,7 +424,9 @@ X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) { TargetJITInfo::LazyResolverFn X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { + TsanIgnoreWritesBegin(); JITCompilerFunction = F; + TsanIgnoreWritesEnd(); #if defined (X86_32_JIT) && !defined (_MSC_VER) if (Subtarget->hasSSE1()) diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 81ee665..9232196 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -368,10 +368,6 @@ ReSimplify: case X86::SETB_C64r: LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break; case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break; case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break; - case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, 
X86::PXORrr); break; - case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::VFsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break; - case X86::VFsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break; case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break; case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index c1ac9f3..4e80432 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -452,7 +452,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - return (RealignStack && + return (MF.getTarget().Options.RealignStack && !MFI->hasVarSizedObjects()); } @@ -583,7 +583,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // sure we restore the stack pointer immediately after the call, there may // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. MachineBasicBlock::iterator B = MBB.begin(); - while (I != B && !llvm::prior(I)->getDesc().isCall()) + while (I != B && !llvm::prior(I)->isCall()) --I; MBB.insert(I, New); } @@ -665,7 +665,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { case MVT::i8: if (High) { switch (Reg) { - default: return 0; + default: return getX86SubSuperRegister(Reg, MVT::i64, High); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AH; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -785,6 +785,22 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { return X86::R15D; } case MVT::i64: + // For 64-bit mode if we've requested a "high" register and the + // Q or r constraints we want one of these high registers or + // just the register name otherwise. + if (High) { + switch (Reg) { + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + // Fallthrough. 
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index e7bcbf8..6e092c7 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -273,6 +273,8 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
     if (IsAMD && ((ECX >> 16) & 0x1)) {
       HasFMA4 = true;
       ToggleFeature(X86::FeatureFMA4);
+      HasXOP = true;
+      ToggleFeature(X86::FeatureXOP);
     }
   }
 }
@@ -317,6 +319,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
   , HasCLMUL(false)
   , HasFMA3(false)
   , HasFMA4(false)
+  , HasXOP(false)
   , HasMOVBE(false)
   , HasRDRAND(false)
   , HasF16C(false)
@@ -387,9 +390,6 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
   assert((!In64BitMode || HasX86_64) &&
          "64-bit code requested on a subtarget that doesn't support it!");
 
-  if(EnableSegmentedStacks && !isTargetELF())
-    report_fatal_error("Segmented stacks are only implemented on ELF.");
-
   // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux and Solaris (both
   // 32 and 64 bit) and for all 64-bit targets.
   if (StackAlignOverride)
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index e93f8e9..ccb9be0 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -93,6 +93,9 @@ protected:
   /// HasFMA4 - Target has 4-operand fused multiply-add
   bool HasFMA4;
 
+  /// HasXOP - Target has XOP instructions
+  bool HasXOP;
+
   /// HasMOVBE - True if the processor has the MOVBE instruction.
   bool HasMOVBE;
 
@@ -198,6 +201,7 @@ public:
   bool hasCLMUL() const { return HasCLMUL; }
   bool hasFMA3() const { return HasFMA3; }
   bool hasFMA4() const { return HasFMA4; }
+  bool hasXOP() const { return HasXOP; }
   bool hasMOVBE() const { return HasMOVBE; }
   bool hasRDRAND() const { return HasRDRAND; }
   bool hasF16C() const { return HasF16C; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 1c9f3bd..126042e 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -31,9 +31,10 @@ extern "C" void LLVMInitializeX86Target() {
 
 X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
                                          StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
                                          Reloc::Model RM, CodeModel::Model CM,
                                          CodeGenOpt::Level OL)
-  : X86TargetMachine(T, TT, CPU, FS, RM, CM, OL, false),
+  : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false),
    DataLayout(getSubtargetImpl()->isTargetDarwin() ?
               "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-"
               "n8:16:32-S128" :
@@ -52,9 +53,10 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
 
 X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
                                          StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
                                          Reloc::Model RM, CodeModel::Model CM,
                                          CodeGenOpt::Level OL)
-  : X86TargetMachine(T, TT, CPU, FS, RM, CM, OL, true),
+  : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true),
    DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
               "n8:16:32:64-S128"),
    InstrInfo(*this),
@@ -67,11 +69,12 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
 ///
 X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
                                    StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
                                    Reloc::Model RM, CodeModel::Model CM,
                                    CodeGenOpt::Level OL,
                                    bool is64Bit)
-  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL),
-    Subtarget(TT, CPU, FS, StackAlignmentOverride, is64Bit),
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS, Options.StackAlignmentOverride, is64Bit),
    FrameLowering(*this, Subtarget),
    ELFWriterInfo(is64Bit, true) {
   // Determine the PICStyle based on the target selected.
@@ -95,8 +98,11 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
   }
 
   // default to hard float ABI
-  if (FloatABIType == FloatABI::Default)
-    FloatABIType = FloatABI::Hard;
+  if (Options.FloatABIType == FloatABI::Default)
+    this->Options.FloatABIType = FloatABI::Hard;
+
+  if (Options.EnableSegmentedStacks && !Subtarget.isTargetELF())
+    report_fatal_error("Segmented stacks are only implemented on ELF.");
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 64be458..3ac1769 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -38,7 +38,7 @@ class X86TargetMachine : public LLVMTargetMachine {
 
 public:
   X86TargetMachine(const Target &T, StringRef TT,
-                   StringRef CPU, StringRef FS,
+                   StringRef CPU, StringRef FS, const TargetOptions &Options,
                    Reloc::Model RM, CodeModel::Model CM,
                    CodeGenOpt::Level OL,
                    bool is64Bit);
@@ -85,7 +85,7 @@ class X86_32TargetMachine : public X86TargetMachine {
   X86JITInfo JITInfo;
 public:
   X86_32TargetMachine(const Target &T, StringRef TT,
-                      StringRef CPU, StringRef FS,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
                      Reloc::Model RM, CodeModel::Model CM,
                      CodeGenOpt::Level OL);
   virtual const TargetData *getTargetData() const { return &DataLayout; }
@@ -113,7 +113,7 @@ class X86_64TargetMachine : public X86TargetMachine {
   X86JITInfo JITInfo;
 public:
   X86_64TargetMachine(const Target &T, StringRef TT,
-                      StringRef CPU, StringRef FS,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
                      Reloc::Model RM, CodeModel::Model CM,
                      CodeGenOpt::Level OL);
   virtual const TargetData *getTargetData() const { return &DataLayout; }
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 9bb54a8..f8c30eb 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -220,7 +220,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
     MachineInstr *MI = I;
     DebugLoc dl = I->getDebugLoc();
-    bool isControlFlow = MI->getDesc().isCall() || MI->getDesc().isReturn();
+    bool isControlFlow = MI->isCall() || MI->isReturn();
 
     // Shortcut: don't need to check regular instructions in dirty state.
     if (!isControlFlow && CurState == ST_DIRTY)
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index d91da8c..de4abfc 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -21,17 +21,5 @@ add_llvm_target(XCoreCodeGen
   XCoreSelectionDAGInfo.cpp
   )
 
-add_llvm_library_dependencies(LLVMXCoreCodeGen
-  LLVMAsmPrinter
-  LLVMCodeGen
-  LLVMCore
-  LLVMMC
-  LLVMSelectionDAG
-  LLVMSupport
-  LLVMTarget
-  LLVMXCoreDesc
-  LLVMXCoreInfo
-  )
-
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/XCore/LLVMBuild.txt b/lib/Target/XCore/LLVMBuild.txt
index 1f7e2d5..53b4a9e 100644
--- a/lib/Target/XCore/LLVMBuild.txt
+++ b/lib/Target/XCore/LLVMBuild.txt
@@ -15,6 +15,9 @@
 ;
 ;===------------------------------------------------------------------------===;
 
+[common]
+subdirectories = MCTargetDesc TargetInfo
+
 [component_0]
 type = TargetGroup
 name = XCore
@@ -27,4 +30,3 @@ name = XCoreCodeGen
 parent = XCore
 required_libraries = AsmPrinter CodeGen Core MC SelectionDAG Support Target XCoreDesc XCoreInfo
 add_to_library_groups = XCore
-
diff --git a/lib/Target/XCore/MCTargetDesc/CMakeLists.txt b/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
index 269822d..3a3f5b4 100644
--- a/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
@@ -3,11 +3,6 @@ add_llvm_library(LLVMXCoreDesc
   XCoreMCAsmInfo.cpp
   )
 
-add_llvm_library_dependencies(LLVMXCoreDesc
-  LLVMMC
-  LLVMXCoreInfo
-  )
-
 add_dependencies(LLVMXCoreDesc XCoreCommonTableGen)
 
 # Hack: we need to include 'main' target directory to grab private headers
diff --git a/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt b/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt
index 628afb5..a80c939 100644
--- a/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt
@@ -21,4 +21,3 @@ name = XCoreDesc
 parent = XCore
 required_libraries = MC XCoreInfo
 add_to_library_groups = XCore
-
diff --git a/lib/Target/XCore/TargetInfo/CMakeLists.txt b/lib/Target/XCore/TargetInfo/CMakeLists.txt
index 7f84f69..2c34b87 100644
--- a/lib/Target/XCore/TargetInfo/CMakeLists.txt
+++ b/lib/Target/XCore/TargetInfo/CMakeLists.txt
@@ -4,10 +4,4 @@ add_llvm_library(LLVMXCoreInfo
   XCoreTargetInfo.cpp
   )
 
-add_llvm_library_dependencies(LLVMXCoreInfo
-  LLVMMC
-  LLVMSupport
-  LLVMTarget
-  )
-
 add_dependencies(LLVMXCoreInfo XCoreCommonTableGen)
diff --git a/lib/Target/XCore/TargetInfo/LLVMBuild.txt b/lib/Target/XCore/TargetInfo/LLVMBuild.txt
index d0b8e54..770ba87 100644
--- a/lib/Target/XCore/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/XCore/TargetInfo/LLVMBuild.txt
@@ -21,4 +21,3 @@ name = XCoreInfo
 parent = XCore
 required_libraries = MC Support Target
 add_to_library_groups = XCore
-
diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
index b8fb0ca..08f091e 100644
--- a/lib/Target/XCore/XCore.h
+++ b/lib/Target/XCore/XCore.h
@@ -24,7 +24,8 @@ namespace llvm {
   class XCoreTargetMachine;
   class formatted_raw_ostream;
 
-  FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM);
+  FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM,
+                                   CodeGenOpt::Level OptLevel);
 
 } // end namespace llvm;
 
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index 7f8b169..5007d04 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -84,7 +84,8 @@ XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti)
 }
 
 bool XCoreFrameLowering::hasFP(const MachineFunction &MF) const {
-  return DisableFramePointerElim(MF) || MF.getFrameInfo()->hasVarSizedObjects();
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+    MF.getFrameInfo()->hasVarSizedObjects();
 }
 
 void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index 8d746ae..7564fba 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -41,8 +41,8 @@ namespace {
     const XCoreSubtarget &Subtarget;
 
   public:
-    XCoreDAGToDAGISel(XCoreTargetMachine &TM)
-      : SelectionDAGISel(TM),
+    XCoreDAGToDAGISel(XCoreTargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel),
        Lowering(*TM.getTargetLowering()),
        Subtarget(*TM.getSubtargetImpl()) { }
 
@@ -83,8 +83,9 @@ namespace {
 /// createXCoreISelDag - This pass converts a legalized DAG into a
 /// XCore-specific DAG, ready for instruction scheduling.
 ///
-FunctionPass *llvm::createXCoreISelDag(XCoreTargetMachine &TM) {
-  return new XCoreDAGToDAGISel(TM);
+FunctionPass *llvm::createXCoreISelDag(XCoreTargetMachine &TM,
+                                       CodeGenOpt::Level OptLevel) {
+  return new XCoreDAGToDAGISel(TM, OptLevel);
 }
 
 bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Addr, SDValue &Base,
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index d791daa..c5c668e 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -109,6 +109,8 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
   setOperationAction(ISD::ROTL , MVT::i32, Expand);
   setOperationAction(ISD::ROTR , MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
 
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index eec3674..7e1e035 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -21,9 +21,10 @@ using namespace llvm;
 ///
 XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
                                        StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
                                        Reloc::Model RM, CodeModel::Model CM,
                                        CodeGenOpt::Level OL)
-  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL),
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
    Subtarget(TT, CPU, FS),
    DataLayout("e-p:32:32:32-a0:0:32-f32:32:32-f64:32:32-i1:8:32-i8:8:32-"
               "i16:16:32-i32:32:32-i64:32:32-n32"),
@@ -34,7 +35,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
 }
 
 bool XCoreTargetMachine::addInstSelector(PassManagerBase &PM) {
-  PM.add(createXCoreISelDag(*this));
+  PM.add(createXCoreISelDag(*this, getOptLevel()));
   return false;
 }
 
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index 3f2644d..0159b1e 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -33,7 +33,7 @@ class XCoreTargetMachine : public LLVMTargetMachine {
   XCoreSelectionDAGInfo TSInfo;
 public:
   XCoreTargetMachine(const Target &T, StringRef TT,
-                     StringRef CPU, StringRef FS,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
                      Reloc::Model RM, CodeModel::Model CM,
                      CodeGenOpt::Level OL);
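The two new Expand entries in XCoreISelLowering.cpp handle the CTTZ_ZERO_UNDEF/CTLZ_ZERO_UNDEF nodes that SelectionDAG forms when the source marks a zero input as undefined; expanding them simply falls back to the ordinary CTTZ/CTLZ lowering the target already has. Roughly, code like the following produces those nodes (illustrative; __builtin_ctz and __builtin_clz are undefined for 0, so Clang emits llvm.cttz/llvm.ctlz with the is_zero_undef flag set):

    // Becomes llvm.cttz.i32(x, /*is_zero_undef=*/true), i.e. the
    // ISD::CTTZ_ZERO_UNDEF node that XCore now marks Expand.
    unsigned trailingZeros(unsigned x) {
      return __builtin_ctz(x);   // undefined if x == 0
    }

    // Likewise llvm.ctlz.i32(x, true) maps to ISD::CTLZ_ZERO_UNDEF.
    unsigned leadingZeros(unsigned x) {
      return __builtin_clz(x);   // undefined if x == 0
    }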