am 1c4ad5ef: Merge branch \'upstream\' into merge-2012_09_10

* commit '1c4ad5ef4fab105f0c8af7edd026e00502fb6279': (446 commits) Revert r163556. Missed updates to tablegen files. Update function names to conform to guidelines. No functional change intended. test/CodeGen/X86/ms-inline-asm.ll: Relax for non-darwin x86 targets. '##InlineAsm' could not be seen in other hosts. [ms-inline asm] Properly emit the asm directives when the AsmPrinterVariant and InlineAsmVariant don't match. Update test case for Release builds. Remove redundant semicolons which are null statements. Disable stack coloring because it makes dragonegg fail bootstrapping. [ms-inline asm] Pass the correct AsmVariant to the PrintAsmOperand() function and update the printOperand() function accordingly. [ms-inline asm] Add support for .att_syntax directive. Enable stack coloring. Don't attempt to use flags from predicated instructions. [Object] Extract Elf_Ehdr. Patch by Hemant Kulkarni! Stack Coloring: Handle the case where END markers come before BEGIN markers properly. Enhance PR11334 fix to support extload from v2f32/v4f32 Add "blocked" heuristic to the Hexagon MI scheduler. Fold multiply by 0 or 1 when in UnsafeFPMath mode in SelectionDAG::getNode(). whitespace Add boolean simplification support from CMOV Fix an assertion failure when optimising a shufflevector incorrectly into concat_vectors, and a followup bug with SelectionDAG::getNode() creating nodes with invalid types. Minor cleanup. No functional change. ...
author: Stephen Hines <srhines@google.com> 2012-09-13 19:09:19 -0700
committer: Android Git Automerger <android-git-automerger@android.com> 2012-09-13 19:09:19 -0700
commit: 78c041bd883d86c81c42b98f326660277e6d0d9a (patch)
tree: 52800183ec2d22164b8f396842142c3a8aab912a /lib/Target
parent: 828ded66831c0caaeecd2291a6bfb084f373d0e4 (diff)
parent: 1c4ad5ef4fab105f0c8af7edd026e00502fb6279 (diff)
download: external_llvm-78c041bd883d86c81c42b98f326660277e6d0d9a.zip
external_llvm-78c041bd883d86c81c42b98f326660277e6d0d9a.tar.gz
external_llvm-78c041bd883d86c81c42b98f326660277e6d0d9a.tar.bz2
119 files changed, 5857 insertions, 1726 deletions
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 29033e5..e2f0d7d 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -683,7 +683,7 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   // Handle register classes that require multiple instructions.
   unsigned BeginIdx = 0;
   unsigned SubRegs = 0;
-  unsigned Spacing = 1;
+  int Spacing = 1;
 
   // Use VORRq when possible.
   if (ARM::QQPRRegClass.contains(DestReg, SrcReg))
@@ -705,27 +705,38 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg))
     Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4, Spacing = 2;
 
-  if (Opc) {
-    const TargetRegisterInfo *TRI = &getRegisterInfo();
-    MachineInstrBuilder Mov;
-    for (unsigned i = 0; i != SubRegs; ++i) {
-      unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing);
-      unsigned Src = TRI->getSubReg(SrcReg,  BeginIdx + i*Spacing);
-      assert(Dst && Src && "Bad sub-register");
-      Mov = AddDefaultPred(BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
-                             .addReg(Src));
-      // VORR takes two source operands.
-      if (Opc == ARM::VORRq)
-        Mov.addReg(Src);
-    }
-    // Add implicit super-register defs and kills to the last instruction.
-    Mov->addRegisterDefined(DestReg, TRI);
-    if (KillSrc)
-      Mov->addRegisterKilled(SrcReg, TRI);
-    return;
-  }
+  assert(Opc && "Impossible reg-to-reg copy");
 
-  llvm_unreachable("Impossible reg-to-reg copy");
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  MachineInstrBuilder Mov;
+
+  // Copy register tuples backward when the first Dest reg overlaps with SrcReg.
+  if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
+    BeginIdx = BeginIdx + ((SubRegs-1)*Spacing);
+    Spacing = -Spacing;
+  }
+#ifndef NDEBUG
+  SmallSet<unsigned, 4> DstRegs;
+#endif
+  for (unsigned i = 0; i != SubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing);
+    unsigned Src = TRI->getSubReg(SrcReg,  BeginIdx + i*Spacing);
+    assert(Dst && Src && "Bad sub-register");
+#ifndef NDEBUG
+    assert(!DstRegs.count(Src) && "destructive vector copy");
+    DstRegs.insert(Dst);
+#endif
+    Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
+      .addReg(Src);
+    // VORR takes two source operands.
+    if (Opc == ARM::VORRq)
+      Mov.addReg(Src);
+    Mov = AddDefaultPred(Mov);
+  }
+  // Add implicit super-register defs and kills to the last instruction.
+  Mov->addRegisterDefined(DestReg, TRI);
+  if (KillSrc)
+    Mov->addRegisterKilled(SrcReg, TRI);
 }
 
 static const
@@ -1569,16 +1580,20 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
 }
 
 /// Identify instructions that can be folded into a MOVCC instruction, and
-/// return the corresponding opcode for the predicated pseudo-instruction.
-static unsigned canFoldIntoMOVCC(unsigned Reg, MachineInstr *&MI,
-                                 const MachineRegisterInfo &MRI) {
+/// return the defining instruction.
+static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
+                                      const MachineRegisterInfo &MRI,
+                                      const TargetInstrInfo *TII) {
   if (!TargetRegisterInfo::isVirtualRegister(Reg))
     return 0;
   if (!MRI.hasOneNonDBGUse(Reg))
     return 0;
-  MI = MRI.getVRegDef(Reg);
+  MachineInstr *MI = MRI.getVRegDef(Reg);
   if (!MI)
     return 0;
+  // MI is folded into the MOVCC by predicating it.
+  if (!MI->isPredicable())
+    return 0;
   // Check if MI has any non-dead defs or physreg uses. This also detects
   // predicated instructions which will be reading CPSR.
   for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
@@ -1588,55 +1603,18 @@ static unsigned canFoldIntoMOVCC(unsigned Reg, MachineInstr *&MI,
       return 0;
     if (!MO.isReg())
       continue;
+    // MI can't have any tied operands, that would conflict with predication.
+    if (MO.isTied())
+      return 0;
     if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
       return 0;
     if (MO.isDef() && !MO.isDead())
       return 0;
   }
-  switch (MI->getOpcode()) {
-  default: return 0;
-  case ARM::ANDri:   return ARM::ANDCCri;
-  case ARM::ANDrr:   return ARM::ANDCCrr;
-  case ARM::ANDrsi:  return ARM::ANDCCrsi;
-  case ARM::ANDrsr:  return ARM::ANDCCrsr;
-  case ARM::t2ANDri: return ARM::t2ANDCCri;
-  case ARM::t2ANDrr: return ARM::t2ANDCCrr;
-  case ARM::t2ANDrs: return ARM::t2ANDCCrs;
-  case ARM::EORri:   return ARM::EORCCri;
-  case ARM::EORrr:   return ARM::EORCCrr;
-  case ARM::EORrsi:  return ARM::EORCCrsi;
-  case ARM::EORrsr:  return ARM::EORCCrsr;
-  case ARM::t2EORri: return ARM::t2EORCCri;
-  case ARM::t2EORrr: return ARM::t2EORCCrr;
-  case ARM::t2EORrs: return ARM::t2EORCCrs;
-  case ARM::ORRri:   return ARM::ORRCCri;
-  case ARM::ORRrr:   return ARM::ORRCCrr;
-  case ARM::ORRrsi:  return ARM::ORRCCrsi;
-  case ARM::ORRrsr:  return ARM::ORRCCrsr;
-  case ARM::t2ORRri: return ARM::t2ORRCCri;
-  case ARM::t2ORRrr: return ARM::t2ORRCCrr;
-  case ARM::t2ORRrs: return ARM::t2ORRCCrs;
-
-  // ARM ADD/SUB
-  case ARM::ADDri:   return ARM::ADDCCri;
-  case ARM::ADDrr:   return ARM::ADDCCrr;
-  case ARM::ADDrsi:  return ARM::ADDCCrsi;
-  case ARM::ADDrsr:  return ARM::ADDCCrsr;
-  case ARM::SUBri:   return ARM::SUBCCri;
-  case ARM::SUBrr:   return ARM::SUBCCrr;
-  case ARM::SUBrsi:  return ARM::SUBCCrsi;
-  case ARM::SUBrsr:  return ARM::SUBCCrsr;
-
-  // Thumb2 ADD/SUB
-  case ARM::t2ADDri:   return ARM::t2ADDCCri;
-  case ARM::t2ADDri12: return ARM::t2ADDCCri12;
-  case ARM::t2ADDrr:   return ARM::t2ADDCCrr;
-  case ARM::t2ADDrs:   return ARM::t2ADDCCrs;
-  case ARM::t2SUBri:   return ARM::t2SUBCCri;
-  case ARM::t2SUBri12: return ARM::t2SUBCCri12;
-  case ARM::t2SUBrr:   return ARM::t2SUBCCrr;
-  case ARM::t2SUBrs:   return ARM::t2SUBCCrs;
-  }
+  bool DontMoveAcrossStores = true;
+  if (!MI->isSafeToMove(TII, /* AliasAnalysis = */ 0, DontMoveAcrossStores))
+    return 0;
+  return MI;
 }
 
 bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI,
@@ -1665,19 +1643,18 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
   assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
          "Unknown select instruction");
   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
-  MachineInstr *DefMI = 0;
-  unsigned Opc = canFoldIntoMOVCC(MI->getOperand(2).getReg(), DefMI, MRI);
-  bool Invert = !Opc;
-  if (!Opc)
-    Opc = canFoldIntoMOVCC(MI->getOperand(1).getReg(), DefMI, MRI);
-  if (!Opc)
+  MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this);
+  bool Invert = !DefMI;
+  if (!DefMI)
+    DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this);
+  if (!DefMI)
     return 0;
 
   // Create a new predicated version of DefMI.
   // Rfalse is the first use.
   MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-                                      get(Opc), MI->getOperand(0).getReg())
-    .addOperand(MI->getOperand(Invert ? 2 : 1));
+                                      DefMI->getDesc(),
+                                      MI->getOperand(0).getReg());
 
   // Copy all the DefMI operands, excluding its (null) predicate.
   const MCInstrDesc &DefDesc = DefMI->getDesc();
@@ -1696,6 +1673,15 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
   if (NewMI->hasOptionalDef())
     AddDefaultCC(NewMI);
 
+  // The output register value when the predicate is false is an implicit
+  // register operand tied to the first def.
+  // The tie makes the register allocator ensure the FalseReg is allocated the
+  // same register as operand 0.
+  MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
+  FalseReg.setImplicit();
+  NewMI->addOperand(FalseReg);
+  NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
+
   // The caller will erase MI, but not DefMI.
   DefMI->eraseFromParent();
   return NewMI;
@@ -2042,13 +2028,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
 
   // Masked compares sometimes use the same register as the corresponding 'and'.
   if (CmpMask != ~0) {
-    if (!isSuitableForMask(MI, SrcReg, CmpMask, false)) {
+    if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) {
       MI = 0;
       for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SrcReg),
            UE = MRI->use_end(); UI != UE; ++UI) {
         if (UI->getParent() != CmpInstr->getParent()) continue;
         MachineInstr *PotentialAND = &*UI;
-        if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true))
+        if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) ||
+            isPredicated(PotentialAND))
           continue;
         MI = PotentialAND;
         break;
@@ -2114,6 +2101,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
   // The single candidate is called MI.
   if (!MI) MI = Sub;
 
+  // We can't use a predicated instruction - it doesn't always write the flags.
+  if (isPredicated(MI))
+    return false;
+
   switch (MI->getOpcode()) {
   default: break;
   case ARM::RSBrr:
@@ -2220,6 +2211,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
     // Toggle the optional operand to CPSR.
     MI->getOperand(5).setReg(ARM::CPSR);
     MI->getOperand(5).setIsDef(true);
+    assert(!isPredicated(MI) && "Can't use flags from predicated instruction");
     CmpInstr->eraseFromParent();
 
     // Modify the condition code of operands in OperandsToUpdate.
@@ -3366,7 +3358,8 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
   // converted.
   if (Subtarget.isCortexA9() && !isPredicated(MI) &&
       (MI->getOpcode() == ARM::VMOVRS ||
-       MI->getOpcode() == ARM::VMOVSR))
+       MI->getOpcode() == ARM::VMOVSR ||
+       MI->getOpcode() == ARM::VMOVS))
     return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
 
   // No other instructions can be swizzled, so just determine their domain.
@@ -3386,13 +3379,28 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
   return std::make_pair(ExeGeneric, 0);
 }
 
+static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI,
+                                            unsigned SReg, unsigned &Lane) {
+  unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_0, &ARM::DPRRegClass);
+  Lane = 0;
+
+  if (DReg != ARM::NoRegister)
+   return DReg;
+
+  Lane = 1;
+  DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass);
+
+  assert(DReg && "S-register with no D super-register?");
+  return DReg;
+}
+
+
 void
 ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
   unsigned DstReg, SrcReg, DReg;
   unsigned Lane;
   MachineInstrBuilder MIB(MI);
   const TargetRegisterInfo *TRI = &getRegisterInfo();
-  bool isKill;
   switch (MI->getOpcode()) {
     default:
       llvm_unreachable("cannot handle opcode!");
@@ -3403,78 +3411,175 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
 
       // Zap the predicate operands.
       assert(!isPredicated(MI) && "Cannot predicate a VORRd");
-      MI->RemoveOperand(3);
-      MI->RemoveOperand(2);
 
-      // Change to a VORRd which requires two identical use operands.
-      MI->setDesc(get(ARM::VORRd));
+      // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits)
+      DstReg = MI->getOperand(0).getReg();
+      SrcReg = MI->getOperand(1).getReg();
 
-      // Add the extra source operand and new predicates.
-      // This will go before any implicit ops.
-      AddDefaultPred(MachineInstrBuilder(MI).addOperand(MI->getOperand(1)));
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
+      MI->setDesc(get(ARM::VORRd));
+      AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+                        .addReg(SrcReg)
+                        .addReg(SrcReg));
       break;
     case ARM::VMOVRS:
       if (Domain != ExeNEON)
         break;
       assert(!isPredicated(MI) && "Cannot predicate a VGETLN");
 
+      // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits)
       DstReg = MI->getOperand(0).getReg();
       SrcReg = MI->getOperand(1).getReg();
 
-      DReg = TRI->getMatchingSuperReg(SrcReg, ARM::ssub_0, &ARM::DPRRegClass);
-      Lane = 0;
-      if (DReg == ARM::NoRegister) {
-        DReg = TRI->getMatchingSuperReg(SrcReg, ARM::ssub_1, &ARM::DPRRegClass);
-        Lane = 1;
-        assert(DReg && "S-register with no D super-register?");
-      }
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
 
-      MI->RemoveOperand(3);
-      MI->RemoveOperand(2);
-      MI->RemoveOperand(1);
+      DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane);
 
+      // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps)
+      // Note that DSrc has been widened and the other lane may be undef, which
+      // contaminates the entire register.
       MI->setDesc(get(ARM::VGETLNi32));
-      MIB.addReg(DReg);
-      MIB.addImm(Lane);
+      AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+                        .addReg(DReg, RegState::Undef)
+                        .addImm(Lane));
 
-      MIB->getOperand(1).setIsUndef();
+      // The old source should be an implicit use, otherwise we might think it
+      // was dead before here.
       MIB.addReg(SrcReg, RegState::Implicit);
-
-      AddDefaultPred(MIB);
       break;
     case ARM::VMOVSR:
       if (Domain != ExeNEON)
         break;
       assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
 
+      // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits)
       DstReg = MI->getOperand(0).getReg();
       SrcReg = MI->getOperand(1).getReg();
-      DReg = TRI->getMatchingSuperReg(DstReg, ARM::ssub_0, &ARM::DPRRegClass);
-      Lane = 0;
-      if (DReg == ARM::NoRegister) {
-        DReg = TRI->getMatchingSuperReg(DstReg, ARM::ssub_1, &ARM::DPRRegClass);
-        Lane = 1;
-        assert(DReg && "S-register with no D super-register?");
-      }
-      isKill = MI->getOperand(0).isKill();
 
-      MI->RemoveOperand(3);
-      MI->RemoveOperand(2);
-      MI->RemoveOperand(1);
-      MI->RemoveOperand(0);
+      DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
 
+      // If we insert both a novel <def> and an <undef> on the DReg, we break
+      // any existing dependency chain on the unused lane. Either already being
+      // present means this instruction is in that chain anyway so we can make
+      // the transformation.
+      if (!MI->definesRegister(DReg, TRI) && !MI->readsRegister(DReg, TRI))
+          break;
+
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps)
+      // Again DDst may be undefined at the beginning of this instruction.
       MI->setDesc(get(ARM::VSETLNi32));
-      MIB.addReg(DReg, RegState::Define);
-      MIB.addReg(DReg, RegState::Undef);
-      MIB.addReg(SrcReg);
-      MIB.addImm(Lane);
+      MIB.addReg(DReg, RegState::Define)
+         .addReg(DReg, getUndefRegState(!MI->readsRegister(DReg, TRI)))
+         .addReg(SrcReg)
+         .addImm(Lane);
+      AddDefaultPred(MIB);
 
-      if (isKill)
-        MIB->addRegisterKilled(DstReg, TRI, true);
-      MIB->addRegisterDefined(DstReg, TRI);
+      // The narrower destination must be marked as set to keep previous chains
+      // in place.
+      MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+      break;
+    case ARM::VMOVS: {
+      if (Domain != ExeNEON)
+        break;
 
+      // Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits)
+      DstReg = MI->getOperand(0).getReg();
+      SrcReg = MI->getOperand(1).getReg();
+
+      unsigned DstLane = 0, SrcLane = 0, DDst, DSrc;
+      DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
+      DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane);
+
+      // If we insert both a novel <def> and an <undef> on the DReg, we break
+      // any existing dependency chain on the unused lane. Either already being
+      // present means this instruction is in that chain anyway so we can make
+      // the transformation.
+      if (!MI->definesRegister(DDst, TRI) && !MI->readsRegister(DDst, TRI))
+          break;
+
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      if (DSrc == DDst) {
+        // Destination can be:
+        //     %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits)
+        MI->setDesc(get(ARM::VDUPLN32d));
+        MIB.addReg(DDst, RegState::Define)
+           .addReg(DDst, getUndefRegState(!MI->readsRegister(DDst, TRI)))
+           .addImm(SrcLane);
+        AddDefaultPred(MIB);
+
+        // Neither the source or the destination are naturally represented any
+        // more, so add them in manually.
+        MIB.addReg(DstReg, RegState::Implicit | RegState::Define);
+        MIB.addReg(SrcReg, RegState::Implicit);
+        break;
+      }
+
+      // In general there's no single instruction that can perform an S <-> S
+      // move in NEON space, but a pair of VEXT instructions *can* do the
+      // job. It turns out that the VEXTs needed will only use DSrc once, with
+      // the position based purely on the combination of lane-0 and lane-1
+      // involved. For example
+      //     vmov s0, s2 -> vext.32 d0, d0, d1, #1  vext.32 d0, d0, d0, #1
+      //     vmov s1, s3 -> vext.32 d0, d1, d0, #1  vext.32 d0, d0, d0, #1
+      //     vmov s0, s3 -> vext.32 d0, d0, d0, #1  vext.32 d0, d1, d0, #1
+      //     vmov s1, s2 -> vext.32 d0, d0, d0, #1  vext.32 d0, d0, d1, #1
+      //
+      // Pattern of the MachineInstrs is:
+      //     %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits)
+      MachineInstrBuilder NewMIB;
+      NewMIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                       get(ARM::VEXTd32), DDst);
+
+      // On the first instruction, both DSrc and DDst may be <undef> if present.
+      // Specifically when the original instruction didn't have them as an
+      // <imp-use>.
+      unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst;
+      bool CurUndef = !MI->readsRegister(CurReg, TRI);
+      NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst;
+      CurUndef = !MI->readsRegister(CurReg, TRI);
+      NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      NewMIB.addImm(1);
+      AddDefaultPred(NewMIB);
+
+      if (SrcLane == DstLane)
+        NewMIB.addReg(SrcReg, RegState::Implicit);
+
+      MI->setDesc(get(ARM::VEXTd32));
+      MIB.addReg(DDst, RegState::Define);
+
+      // On the second instruction, DDst has definitely been defined above, so
+      // it is not <undef>. DSrc, if present, can be <undef> as above.
+      CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst;
+      CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+      MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst;
+      CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+      MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      MIB.addImm(1);
       AddDefaultPred(MIB);
+
+      if (SrcLane != DstLane)
+        MIB.addReg(SrcReg, RegState::Implicit);
+
+      // As before, the original destination is no longer represented, add it
+      // implicitly.
+      MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
       break;
+    }
   }
 
 }
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index 132b81f..89cbc84 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -410,7 +410,7 @@ bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
 
   do {
     DEBUG(errs() << "JITTing function '"
-          << MF.getFunction()->getName() << "'\n");
+          << MF.getName() << "'\n");
     MCE.startFunction(MF);
     for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
          MBB != E; ++MBB) {
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index a953985..dd05f0c 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -1388,10 +1388,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
     // If the original WaterList entry was "new water" on this iteration,
     // propagate that to the new island.  This is just keeping NewWaterList
     // updated to match the WaterList, which will be updated below.
-    if (NewWaterList.count(WaterBB)) {
-      NewWaterList.erase(WaterBB);
+    if (NewWaterList.erase(WaterBB))
       NewWaterList.insert(NewIsland);
-    }
+
     // The new CPE goes before the following block (NewMBB).
     NewMBB = llvm::next(MachineFunction::iterator(WaterBB));
 
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 15bb32e..8ed6b75 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1208,6 +1208,57 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       ExpandLaneOp(MBBI);
       return true;
 
+    case ARM::VSETLNi8Q:
+    case ARM::VSETLNi16Q: {
+      // Expand VSETLNs acting on a Q register to equivalent VSETLNs acting
+      // on the respective D register.
+
+      unsigned QReg  = MI.getOperand(1).getReg();
+      unsigned QLane = MI.getOperand(3).getImm();
+
+      unsigned NewOpcode, DLane, DSubReg;
+      switch (Opcode) {
+      default: llvm_unreachable("Invalid opcode!");
+      case ARM::VSETLNi8Q:
+        // 4 possible 8-bit lanes per DPR:
+        NewOpcode = ARM::VSETLNi8;
+        DLane = QLane % 8;
+        DSubReg  = (QLane / 8) ? ARM::dsub_1 : ARM::dsub_0;
+        break;
+      case ARM::VSETLNi16Q:
+        // 4 possible 16-bit lanes per DPR.
+        NewOpcode = ARM::VSETLNi16;
+        DLane = QLane % 4;
+        DSubReg  = (QLane / 4) ? ARM::dsub_1 : ARM::dsub_0;
+        break;
+      }
+
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpcode));
+
+      unsigned DReg = TRI->getSubReg(QReg, DSubReg);
+
+      MIB.addReg(DReg, RegState::Define); // Output DPR
+      MIB.addReg(DReg);                   // Input DPR
+      MIB.addOperand(MI.getOperand(2));   // Input GPR
+      MIB.addImm(DLane);                  // Lane
+
+      // Add the predicate operands.
+      MIB.addOperand(MI.getOperand(4));
+      MIB.addOperand(MI.getOperand(5));
+
+      if (MI.getOperand(1).isKill()) // Add an implicit kill for the Q register.
+        MIB->addRegisterKilled(QReg, TRI, true);
+      // And an implicit def of the output register (which should always be the
+      // same as the input register).
+      MIB->addRegisterDefined(QReg, TRI);
+
+      TransferImpOps(MI, MIB, MIB);
+
+      MI.eraseFromParent();
+      return true;
+    }
+
     case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true;
     case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true;
     case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 5a5ca1b..045d904 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -617,10 +617,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
   if (VT != MVT::i32) return 0;
 
   Reloc::Model RelocM = TM.getRelocationModel();
-
-  // TODO: Need more magic for ARM PIC.
-  if (!isThumb2 && (RelocM == Reloc::PIC_)) return 0;
-
+  bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM);
   unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
 
   // Use movw+movt when possible, it avoids constant pool entries.
@@ -668,17 +665,30 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
         .addConstantPoolIndex(Idx);
       if (RelocM == Reloc::PIC_)
         MIB.addImm(Id);
+      AddOptionalDefs(MIB);
     } else {
       // The extra immediate is for addrmode2.
       MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp),
                     DestReg)
         .addConstantPoolIndex(Idx)
         .addImm(0);
+      AddOptionalDefs(MIB);
+
+      if (RelocM == Reloc::PIC_) {
+        unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD;
+        unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+
+        MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                          DL, TII.get(Opc), NewDestReg)
+                                  .addReg(DestReg)
+                                  .addImm(Id);
+        AddOptionalDefs(MIB);
+        return NewDestReg;
+      }
     }
-    AddOptionalDefs(MIB);
   }
 
-  if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) {
+  if (IsIndirect) {
     MachineInstrBuilder MIB;
     unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
     if (isThumb2)
@@ -2212,25 +2222,17 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
   unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls);
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
                                     DL, TII.get(CallOpc));
-  if (isThumb2) {
-    // Explicitly adding the predicate here.
+  // BL / BLX don't take a predicate, but tBL / tBLX do.
+  if (isThumb2)
     AddDefaultPred(MIB);
-    if (EnableARMLongCalls)
-      MIB.addReg(CalleeReg);
-    else
-      MIB.addExternalSymbol(TLI.getLibcallName(Call));
-  } else {
-    if (EnableARMLongCalls)
-      MIB.addReg(CalleeReg);
-    else
-      MIB.addExternalSymbol(TLI.getLibcallName(Call));
+  if (EnableARMLongCalls)
+    MIB.addReg(CalleeReg);
+  else
+    MIB.addExternalSymbol(TLI.getLibcallName(Call));
 
-    // Explicitly adding the predicate here.
-    AddDefaultPred(MIB);
-  }
   // Add implicit physical register uses to the call.
   for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
-    MIB.addReg(RegArgs[i]);
+    MIB.addReg(RegArgs[i], RegState::Implicit);
 
   // Add a register mask with the call-preserved registers.
   // Proper defs for return values will be added by setPhysRegsDeadExcept().
@@ -2358,30 +2360,20 @@ bool ARMFastISel::SelectCall(const Instruction *I,
   unsigned CallOpc = ARMSelectCallOp(UseReg);
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
                                     DL, TII.get(CallOpc));
-  if(isThumb2) {
-    // Explicitly adding the predicate here.
-    AddDefaultPred(MIB);
-    if (UseReg)
-      MIB.addReg(CalleeReg);
-    else if (!IntrMemName)
-      MIB.addGlobalAddress(GV, 0, 0);
-    else
-      MIB.addExternalSymbol(IntrMemName, 0);
-  } else {
-    if (UseReg)
-      MIB.addReg(CalleeReg);
-    else if (!IntrMemName)
-      MIB.addGlobalAddress(GV, 0, 0);
-    else
-      MIB.addExternalSymbol(IntrMemName, 0);
 
-    // Explicitly adding the predicate here.
+  // ARM calls don't take a predicate, but tBL / tBLX do.
+  if(isThumb2)
     AddDefaultPred(MIB);
-  }
+  if (UseReg)
+    MIB.addReg(CalleeReg);
+  else if (!IntrMemName)
+    MIB.addGlobalAddress(GV, 0, 0);
+  else
+    MIB.addExternalSymbol(IntrMemName, 0);
 
   // Add implicit physical register uses to the call.
   for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
-    MIB.addReg(RegArgs[i]);
+    MIB.addReg(RegArgs[i], RegState::Implicit);
 
   // Add a register mask with the call-preserved registers.
   // Proper defs for return values will be added by setPhysRegsDeadExcept().
@@ -2650,7 +2642,7 @@ bool ARMFastISel::SelectShift(const Instruction *I,
   unsigned Reg1 = getRegForValue(Src1Value);
   if (Reg1 == 0) return false;
 
-  unsigned Reg2;
+  unsigned Reg2 = 0;
   if (Opc == ARM::MOVsr) {
     Reg2 = getRegForValue(Src2Value);
     if (Reg2 == 0) return false;
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index c6f9d15..1406620 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2637,6 +2637,38 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                     dl, MVT::i32, MVT::i32, Ops, 5);
     }
   }
+  case ARMISD::UMLAL:{
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32)};
+      return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops, 6);
+    }else{
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                      ARM::UMLAL : ARM::UMLALv5,
+                                      dl, MVT::i32, MVT::i32, Ops, 7);
+    }
+  }
+  case ARMISD::SMLAL:{
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32)};
+      return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops, 6);
+    }else{
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                      ARM::SMLAL : ARM::SMLALv5,
+                                      dl, MVT::i32, MVT::i32, Ops, 7);
+    }
+  }
   case ISD::LOAD: {
     SDNode *ResNode = 0;
     if (Subtarget->isThumb() && Subtarget->hasThumb2())
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index df4039b..e51315e 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -514,6 +514,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
 
     // Neon does not support some operations on v1i64 and v2i64 types.
     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
@@ -566,6 +567,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     }
   }
 
+  // ARM and Thumb2 support UMLAL/SMLAL.
+  if (!Subtarget->isThumb1Only())
+    setTargetDAGCombine(ISD::ADDC);
+
+
   computeRegisterProperties();
 
   // ARM does not have f32 extending load.
@@ -791,12 +797,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setTargetDAGCombine(ISD::ADD);
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::MUL);
-
-  if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON()) {
-    setTargetDAGCombine(ISD::AND);
-    setTargetDAGCombine(ISD::OR);
-    setTargetDAGCombine(ISD::XOR);
-  }
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::XOR);
 
   if (Subtarget->hasV6Ops())
     setTargetDAGCombine(ISD::SRL);
@@ -981,6 +984,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
+  case ARMISD::UMLAL:         return "ARMISD::UMLAL";
+  case ARMISD::SMLAL:         return "ARMISD::SMLAL";
   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
   case ARMISD::FMAX:          return "ARMISD::FMAX";
   case ARMISD::FMIN:          return "ARMISD::FMIN";
@@ -4154,10 +4159,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   }
 
   // Scan through the operands to see if only one value is used.
+  //
+  // As an optimisation, even if more than one value is used it may be more
+  // profitable to splat with one value then change some lanes.
+  //
+  // Heuristically we decide to do this if the vector has a "dominant" value,
+  // defined as splatted to more than half of the lanes.
   unsigned NumElts = VT.getVectorNumElements();
   bool isOnlyLowElement = true;
   bool usesOnlyOneValue = true;
+  bool hasDominantValue = false;
   bool isConstant = true;
+
+  // Map of the number of times a particular SDValue appears in the
+  // element list.
+  DenseMap<SDValue, unsigned> ValueCounts;
   SDValue Value;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
@@ -4168,13 +4184,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
       isConstant = false;
 
-    if (!Value.getNode())
+    ValueCounts.insert(std::make_pair(V, 0));
+    unsigned &Count = ValueCounts[V];
+    
+    // Is this value dominant? (takes up more than half of the lanes)
+    if (++Count > (NumElts / 2)) {
+      hasDominantValue = true;
       Value = V;
-    else if (V != Value)
-      usesOnlyOneValue = false;
+    }
   }
+  if (ValueCounts.size() != 1)
+    usesOnlyOneValue = false;
+  if (!Value.getNode() && ValueCounts.size() > 0)
+    Value = ValueCounts.begin()->first;
 
-  if (!Value.getNode())
+  if (ValueCounts.size() == 0)
     return DAG.getUNDEF(VT);
 
   if (isOnlyLowElement)
@@ -4184,9 +4208,34 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
 
   // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
   // i32 and try again.
-  if (usesOnlyOneValue && EltSize <= 32) {
-    if (!isConstant)
-      return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+  if (hasDominantValue && EltSize <= 32) {
+    if (!isConstant) {
+      SDValue N;
+
+      // If we are VDUPing a value that comes directly from a vector, that will
+      // cause an unnecessary move to and from a GPR, where instead we could
+      // just use VDUPLANE.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT)
+        N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                        Value->getOperand(0), Value->getOperand(1));
+      else
+        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+
+      if (!usesOnlyOneValue) {
+        // The dominant value was splatted as 'N', but we now have to insert
+        // all differing elements.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          if (Op.getOperand(I) == Value)
+            continue;
+          SmallVector<SDValue, 3> Ops;
+          Ops.push_back(N);
+          Ops.push_back(Op.getOperand(I));
+          Ops.push_back(DAG.getConstant(I, MVT::i32));
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3);
+        }
+      }
+      return N;
+    }
     if (VT.getVectorElementType().isFloatingPoint()) {
       SmallVector<SDValue, 8> Ops;
       for (unsigned i = 0; i < NumElts; ++i)
@@ -4198,9 +4247,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
       if (Val.getNode())
         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
     }
-    SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
-    if (Val.getNode())
-      return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+    if (usesOnlyOneValue) {
+      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+      if (isConstant && Val.getNode())
+        return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 
+    }
   }
 
   // If all elements are constants and the case above didn't get hit, fall back
@@ -5418,7 +5469,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
   const TargetRegisterClass *TRC = isThumb2 ?
-    (const TargetRegisterClass*)&ARM::tGPRRegClass :
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
     (const TargetRegisterClass*)&ARM::GPRRegClass;
   unsigned scratch = MRI.createVirtualRegister(TRC);
   unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
@@ -5529,7 +5580,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
   const TargetRegisterClass *TRC = isThumb2 ?
-    (const TargetRegisterClass*)&ARM::tGPRRegClass :
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
     (const TargetRegisterClass*)&ARM::GPRRegClass;
   unsigned scratch = MRI.createVirtualRegister(TRC);
   unsigned scratch2 = MRI.createVirtualRegister(TRC);
@@ -7193,6 +7244,154 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
   return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp);
 }
 
+static SDValue findMUL_LOHI(SDValue V) {
+  if (V->getOpcode() == ISD::UMUL_LOHI ||
+      V->getOpcode() == ISD::SMUL_LOHI)
+    return V;
+  return SDValue();
+}
+
+static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *Subtarget) {
+
+  if (Subtarget->isThumb1Only()) return SDValue();
+
+  // Only perform the checks after legalize when the pattern is available.
+  if (DCI.isBeforeLegalize()) return SDValue();
+
+  // Look for multiply add opportunities.
+  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
+  // each add nodes consumes a value from ISD::UMUL_LOHI and there is
+  // a glue link from the first add to the second add.
+  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
+  // a S/UMLAL instruction.
+  //          loAdd   UMUL_LOHI
+  //            \    / :lo    \ :hi
+  //             \  /          \          [no multiline comment]
+  //              ADDC         |  hiAdd
+  //                 \ :glue  /  /
+  //                  \      /  /
+  //                    ADDE
+  //
+  assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
+  SDValue AddcOp0 = AddcNode->getOperand(0);
+  SDValue AddcOp1 = AddcNode->getOperand(1);
+
+  // Check if the two operands are from the same mul_lohi node.
+  if (AddcOp0.getNode() == AddcOp1.getNode())
+    return SDValue();
+
+  assert(AddcNode->getNumValues() == 2 &&
+         AddcNode->getValueType(0) == MVT::i32 &&
+         AddcNode->getValueType(1) == MVT::Glue &&
+         "Expect ADDC with two result values: i32, glue");
+
+  // Check that the ADDC adds the low result of the S/UMUL_LOHI.
+  if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
+      AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
+      AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
+      AddcOp1->getOpcode() != ISD::SMUL_LOHI)
+    return SDValue();
+
+  // Look for the glued ADDE.
+  SDNode* AddeNode = AddcNode->getGluedUser();
+  if (AddeNode == NULL)
+    return SDValue();
+
+  // Make sure it is really an ADDE.
+  if (AddeNode->getOpcode() != ISD::ADDE)
+    return SDValue();
+
+  assert(AddeNode->getNumOperands() == 3 &&
+         AddeNode->getOperand(2).getValueType() == MVT::Glue &&
+         "ADDE node has the wrong inputs");
+
+  // Check for the triangle shape.
+  SDValue AddeOp0 = AddeNode->getOperand(0);
+  SDValue AddeOp1 = AddeNode->getOperand(1);
+
+  // Make sure that the ADDE operands are not coming from the same node.
+  if (AddeOp0.getNode() == AddeOp1.getNode())
+    return SDValue();
+
+  // Find the MUL_LOHI node walking up ADDE's operands.
+  bool IsLeftOperandMUL = false;
+  SDValue MULOp = findMUL_LOHI(AddeOp0);
+  if (MULOp == SDValue())
+   MULOp = findMUL_LOHI(AddeOp1);
+  else
+    IsLeftOperandMUL = true;
+  if (MULOp == SDValue())
+     return SDValue();
+
+  // Figure out the right opcode.
+  unsigned Opc = MULOp->getOpcode();
+  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
+
+  // Figure out the high and low input values to the MLAL node.
+  SDValue* HiMul = &MULOp;
+  SDValue* HiAdd = NULL;
+  SDValue* LoMul = NULL;
+  SDValue* LowAdd = NULL;
+
+  if (IsLeftOperandMUL)
+    HiAdd = &AddeOp1;
+  else
+    HiAdd = &AddeOp0;
+
+
+  if (AddcOp0->getOpcode() == Opc) {
+    LoMul = &AddcOp0;
+    LowAdd = &AddcOp1;
+  }
+  if (AddcOp1->getOpcode() == Opc) {
+    LoMul = &AddcOp1;
+    LowAdd = &AddcOp0;
+  }
+
+  if (LoMul == NULL)
+    return SDValue();
+
+  if (LoMul->getNode() != HiMul->getNode())
+    return SDValue();
+
+  // Create the merged node.
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Build operand list.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(LoMul->getOperand(0));
+  Ops.push_back(LoMul->getOperand(1));
+  Ops.push_back(*LowAdd);
+  Ops.push_back(*HiAdd);
+
+  SDValue MLALNode =  DAG.getNode(FinalOpc, AddcNode->getDebugLoc(),
+                                 DAG.getVTList(MVT::i32, MVT::i32),
+                                 &Ops[0], Ops.size());
+
+  // Replace the ADDs' nodes uses by the MLA node's values.
+  SDValue HiMLALResult(MLALNode.getNode(), 1);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+
+  SDValue LoMLALResult(MLALNode.getNode(), 0);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+
+  // Return original node to notify the driver to stop replacing.
+  SDValue resNode(AddcNode, 0);
+  return resNode;
+}
+
+/// PerformADDCCombine - Target-specific dag combine transform from
+/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
+static SDValue PerformADDCCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+
+  return AddCombineTo64bitMLAL(N, DCI, Subtarget);
+
+}
+
 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
 /// called with the default operands, and if that fails, with commuted
@@ -8764,6 +8963,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
   default: break;
+  case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
   case ISD::SUB:        return PerformSUBCombine(N, DCI);
   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 13b83de..2b8f382 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -173,6 +173,9 @@ namespace llvm {
       VMULLs,       // ...signed
       VMULLu,       // ...unsigned
 
+      UMLAL,        // 64bit Unsigned Accumulate Multiply
+      SMLAL,        // 64bit Signed Accumulate Multiply
+
       // Operands of the standard BUILD_VECTOR node are not legalized, which
       // is fine if BUILD_VECTORs are always lowered to shuffles or other
       // operations, but for ARM some BUILD_VECTORs are legal as-is and their
@@ -257,6 +260,11 @@ namespace llvm {
 
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
+    virtual bool isSelectSupported(SelectSupportKind Kind) const {
+      // ARM does not support scalar condition selects on vectors.
+      return (Kind != ScalarCondVectorVal);
+    }
+
     /// getSetCCResultType - Return the value type to use for ISD::SETCC.
     virtual EVT getSetCCResultType(EVT VT) const;
 
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 992aba5..e23989e 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -83,6 +83,13 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
                                              SDTCisInt<0>,
                                              SDTCisVT<1, i32>,
                                              SDTCisVT<4, i32>]>;
+
+def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                        SDTCisVT<2, i32>, SDTCisVT<3, i32>,
+                                        SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >;
+def ARMUmlal         : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>;
+def ARMSmlal         : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>;
+
 // Node definitions.
 def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
 def ARMWrapperDYN    : SDNode<"ARMISD::WrapperDYN",  SDTIntUnaryOp>;
@@ -90,9 +97,10 @@ def ARMWrapperPIC    : SDNode<"ARMISD::WrapperPIC",  SDTIntUnaryOp>;
 def ARMWrapperJT     : SDNode<"ARMISD::WrapperJT",   SDTIntBinOp>;
 
 def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
-                              [SDNPHasChain, SDNPOutGlue]>;
+                              [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
 def ARMcallseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_ARMCallSeqEnd,
-                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+                              [SDNPHasChain, SDNPSideEffect,
+                               SDNPOptInGlue, SDNPOutGlue]>;
 def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
                                 SDT_ARMStructByVal,
                                 [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
@@ -148,14 +156,16 @@ def ARMsube          : SDNode<"ARMISD::SUBE",  SDTBinaryArithWithFlagsInOut>;
 
 def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
 def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
-                               SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain]>;
+                               SDT_ARMEH_SJLJ_Setjmp,
+                               [SDNPHasChain, SDNPSideEffect]>;
 def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
-                               SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>;
+                               SDT_ARMEH_SJLJ_Longjmp,
+                               [SDNPHasChain, SDNPSideEffect]>;
 
 def ARMMemBarrier     : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER,
-                               [SDNPHasChain]>;
+                               [SDNPHasChain, SDNPSideEffect]>;
 def ARMMemBarrierMCR  : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
-                               [SDNPHasChain]>;
+                               [SDNPHasChain, SDNPSideEffect]>;
 def ARMPreload        : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
                                [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
 
@@ -275,7 +285,7 @@ def imm16_31 : ImmLeaf<i32, [{
 
 def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; }
 def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
-    int64_t Value = -(int)N->getZExtValue();
+    unsigned Value = -(unsigned)N->getZExtValue();
     return Value && ARM_AM::getSOImmVal(Value) != -1;
   }], imm_neg_XFORM> {
   let ParserMatchClass = so_imm_neg_asmoperand;
@@ -1791,12 +1801,15 @@ def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label),
   let Inst{15-12} = Rd;
   let Inst{11-0} = label{11-0};
 }
+
+let hasSideEffects = 1 in {
 def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
                     4, IIC_iALUi, []>;
 
 def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd),
                       (ins i32imm:$label, nohash_imm:$id, pred:$p),
                       4, IIC_iALUi, []>;
+}
 
 //===----------------------------------------------------------------------===//
 //  Control Flow Instructions.
@@ -3399,6 +3412,18 @@ class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
   let Inst{11-8}  = Rm;
   let Inst{3-0}   = Rn;
 }
+class AsMla1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
 
 // FIXME: The v5 pseudos are only necessary for the additional Constraint
 //        property. Remove them when it's possible to add those properties
@@ -3481,14 +3506,14 @@ def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
 }
 
 // Multiply + accumulate
-def SMLAL : AsMul1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
-                               (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
+                        (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
                     "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-                    Requires<[IsARM, HasV6]>;
-def UMLAL : AsMul1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
-                               (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
+                        (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
                     "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-                    Requires<[IsARM, HasV6]>;
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
 
 def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
                                (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
@@ -3504,17 +3529,22 @@ def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
   let Inst{3-0}   = Rn;
 }
 
-let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
+let Constraints = "$RLo = $RdLo,$RHi = $RdHi" in {
 def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
-                              (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+                (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
                               4, IIC_iMAC64, [],
-          (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+             (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+                           pred:$p, cc_out:$s)>,
                            Requires<[IsARM, NoV6]>;
 def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
-                              (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+                (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
                               4, IIC_iMAC64, [],
-          (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+             (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+                           pred:$p, cc_out:$s)>,
                            Requires<[IsARM, NoV6]>;
+}
+
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
 def UMAALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
                               (ins GPR:$Rn, GPR:$Rm, pred:$p),
                               4, IIC_iMAC64, [],
@@ -3986,48 +4016,6 @@ def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
  [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>,
                 RegConstraint<"$false = $Rd">;
 
-// Conditional instructions
-multiclass AsI1_bincc_irs<Instruction iri, Instruction irr, Instruction irsi,
-                          Instruction irsr,
-                          InstrItinClass iii, InstrItinClass iir,
-                          InstrItinClass iis> {
-  def ri  : ARMPseudoExpand<(outs GPR:$Rd),
-                            (ins GPR:$Rfalse, GPR:$Rn, so_imm:$imm,
-                                 pred:$p, cc_out:$s),
-                            4, iii, [],
-                       (iri GPR:$Rd, GPR:$Rn, so_imm:$imm, pred:$p, cc_out:$s)>,
-                            RegConstraint<"$Rfalse = $Rd">;
-  def rr  : ARMPseudoExpand<(outs GPR:$Rd),
-                            (ins GPR:$Rfalse, GPR:$Rn, GPR:$Rm,
-                                 pred:$p, cc_out:$s),
-                            4, iir, [],
-                           (irr GPR:$Rd, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
-                            RegConstraint<"$Rfalse = $Rd">;
-  def rsi : ARMPseudoExpand<(outs GPR:$Rd),
-                            (ins GPR:$Rfalse, GPR:$Rn, so_reg_imm:$shift,
-                                 pred:$p, cc_out:$s),
-                            4, iis, [],
-                (irsi GPR:$Rd, GPR:$Rn, so_reg_imm:$shift, pred:$p, cc_out:$s)>,
-                            RegConstraint<"$Rfalse = $Rd">;
-  def rsr : ARMPseudoExpand<(outs GPRnopc:$Rd),
-                           (ins GPRnopc:$Rfalse, GPRnopc:$Rn, so_reg_reg:$shift,
-                                pred:$p, cc_out:$s),
-                            4, iis, [],
-                (irsr GPR:$Rd, GPR:$Rn, so_reg_reg:$shift, pred:$p, cc_out:$s)>,
-                            RegConstraint<"$Rfalse = $Rd">;
-}
-
-defm ANDCC : AsI1_bincc_irs<ANDri, ANDrr, ANDrsi, ANDrsr,
-                            IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm ORRCC : AsI1_bincc_irs<ORRri, ORRrr, ORRrsi, ORRrsr,
-                            IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm EORCC : AsI1_bincc_irs<EORri, EORrr, EORrsi, EORrsr,
-                            IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm ADDCC : AsI1_bincc_irs<ADDri, ADDrr, ADDrsi, ADDrsr,
-                            IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm SUBCC : AsI1_bincc_irs<SUBri, SUBrr, SUBrsi, SUBrsr,
-                            IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-
 } // neverHasSideEffects
 
 
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 048d340..8158a11 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -1980,7 +1980,7 @@ def VST1LNd8  : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8,
 def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
                        NEONvgetlaneu, addrmode6> {
   let Inst{7-6} = lane{1-0};
-  let Inst{4}   = Rn{5};
+  let Inst{4}   = Rn{4};
 }
 
 def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt,
@@ -2023,7 +2023,7 @@ def VST1LNd8_UPD  : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8,
 def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
                              NEONvgetlaneu, addrmode6> {
   let Inst{7-6} = lane{1-0};
-  let Inst{4}   = Rn{5};
+  let Inst{4}   = Rn{4};
 }
 def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store,
                              extractelt, addrmode6oneL32> {
@@ -5045,25 +5045,23 @@ def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V),
                                            GPR:$R, imm:$lane))]> {
   let Inst{21} = lane{0};
 }
+
+def VSETLNi8Q : PseudoNeonI<(outs QPR:$V),
+                             (ins QPR:$src1, GPR:$R, VectorIndex8:$lane),
+                             IIC_VMOVISL, "",
+                             [(set QPR:$V, (vector_insert (v16i8 QPR:$src1),
+                                           GPR:$R, imm:$lane))]>;
+def VSETLNi16Q : PseudoNeonI<(outs QPR:$V),
+                             (ins QPR:$src1, GPR:$R, VectorIndex16:$lane),
+                             IIC_VMOVISL, "",
+                             [(set QPR:$V, (vector_insert (v8i16 QPR:$src1),
+                                           GPR:$R, imm:$lane))]>;
 }
-def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
-          (v16i8 (INSERT_SUBREG QPR:$src1,
-                  (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
-                                   (DSubReg_i8_reg imm:$lane))),
-                            GPR:$src2, (SubReg_i8_lane imm:$lane))),
-                  (DSubReg_i8_reg imm:$lane)))>;
-def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
-          (v8i16 (INSERT_SUBREG QPR:$src1,
-                  (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
-                                     (DSubReg_i16_reg imm:$lane))),
-                             GPR:$src2, (SubReg_i16_lane imm:$lane))),
-                  (DSubReg_i16_reg imm:$lane)))>;
+
 def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
-          (v4i32 (INSERT_SUBREG QPR:$src1,
-                  (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
-                                     (DSubReg_i32_reg imm:$lane))),
-                             GPR:$src2, (SubReg_i32_lane imm:$lane))),
-                  (DSubReg_i32_reg imm:$lane)))>;
+         (v4i32 (INSERT_SUBREG QPR:$src1,
+                 GPR:$src2,
+                 (SSubReg_f32_reg imm:$lane)))>;
 
 def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
           (INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)),
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 554f6d9..e171f8b 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -1200,6 +1200,7 @@ let neverHasSideEffects = 1, isReMaterializable = 1 in
 def tLEApcrel   : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
                               2, IIC_iALUi, []>;
 
+let hasSideEffects = 1 in
 def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
                               (ins i32imm:$label, nohash_imm:$id, pred:$p),
                               2, IIC_iALUi, []>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 8ecf009..f1a6cce 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -523,6 +523,23 @@ class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
   let Inst{7-4}   = opc7_4;
   let Inst{3-0}   = Rm;
 }
+class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-23} = 0b111110111;
+  let Inst{22-20} = opc22_20;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = RdHi;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
 
 
 /// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
@@ -757,33 +774,6 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
      let Inst{24} = 1;
      let Inst{23-21} = op23_21;
    }
-
-   // Predicated versions.
-   def CCri : t2PseudoExpand<(outs GPRnopc:$Rd),
-                             (ins GPRnopc:$Rfalse, GPRnopc:$Rn, t2_so_imm:$imm,
-                                  pred:$p, cc_out:$s), 4, IIC_iALUi, [],
-                             (!cast<Instruction>(NAME#ri) GPRnopc:$Rd,
-                              GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>,
-              RegConstraint<"$Rfalse = $Rd">;
-   def CCri12 : t2PseudoExpand<(outs GPRnopc:$Rd),
-                             (ins GPRnopc:$Rfalse, GPR:$Rn, imm0_4095:$imm,
-                                  pred:$p),
-                             4, IIC_iALUi, [],
-                             (!cast<Instruction>(NAME#ri12) GPRnopc:$Rd,
-                              GPR:$Rn, imm0_4095:$imm, pred:$p)>,
-                RegConstraint<"$Rfalse = $Rd">;
-   def CCrr : t2PseudoExpand<(outs GPRnopc:$Rd),
-                             (ins GPRnopc:$Rfalse, GPRnopc:$Rn, rGPR:$Rm,
-                                  pred:$p, cc_out:$s), 4, IIC_iALUr, [],
-                             (!cast<Instruction>(NAME#rr) GPRnopc:$Rd,
-                              GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>,
-              RegConstraint<"$Rfalse = $Rd">;
-   def CCrs : t2PseudoExpand<(outs GPRnopc:$Rd),
-                             (ins GPRnopc:$Rfalse, GPRnopc:$Rn, t2_so_reg:$Rm,
-                                  pred:$p, cc_out:$s), 4, IIC_iALUsi, [],
-                             (!cast<Instruction>(NAME#rs) GPRnopc:$Rd,
-                              GPRnopc:$Rn, t2_so_reg:$Rm, pred:$p, cc_out:$s)>,
-              RegConstraint<"$Rfalse = $Rd">;
 }
 
 /// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns
@@ -1200,6 +1190,7 @@ def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
 let neverHasSideEffects = 1, isReMaterializable = 1 in
 def t2LEApcrel   : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p),
                                 4, IIC_iALUi, []>;
+let hasSideEffects = 1 in
 def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
                                 (ins i32imm:$label, nohash_imm:$id, pred:$p),
                                 4, IIC_iALUi,
@@ -2437,15 +2428,17 @@ def t2UMULL : T2MulLong<0b010, 0b0000,
 } // isCommutable
 
 // Multiply + accumulate
-def t2SMLAL : T2MulLong<0b100, 0b0000,
+def t2SMLAL : T2MlaLong<0b100, 0b0000,
                   (outs rGPR:$RdLo, rGPR:$RdHi),
-                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
-                  "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+                  (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
+                  "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                  RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">;
 
-def t2UMLAL : T2MulLong<0b110, 0b0000,
+def t2UMLAL : T2MlaLong<0b110, 0b0000,
                   (outs rGPR:$RdLo, rGPR:$RdHi),
-                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
-                  "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+                  (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
+                  "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                  RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">;
 
 def t2UMAAL : T2MulLong<0b110, 0b0110,
                   (outs rGPR:$RdLo, rGPR:$RdHi),
@@ -3049,37 +3042,6 @@ def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd),
                  RegConstraint<"$false = $Rd">;
 } // isCodeGenOnly = 1
 
-multiclass T2I_bincc_irs<Instruction iri, Instruction irr, Instruction irs,
-                   InstrItinClass iii, InstrItinClass iir, InstrItinClass iis> {
-   // shifted imm
-   def ri : t2PseudoExpand<(outs rGPR:$Rd),
-                           (ins rGPR:$Rfalse, rGPR:$Rn, t2_so_imm:$imm,
-                                pred:$p, cc_out:$s),
-                           4, iii, [],
-                  (iri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>,
-                           RegConstraint<"$Rfalse = $Rd">;
-   // register
-   def rr : t2PseudoExpand<(outs rGPR:$Rd),
-                           (ins rGPR:$Rfalse, rGPR:$Rn, rGPR:$Rm,
-                                pred:$p, cc_out:$s),
-                           4, iir, [],
-                        (irr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>,
-                           RegConstraint<"$Rfalse = $Rd">;
-   // shifted register
-   def rs : t2PseudoExpand<(outs rGPR:$Rd),
-                           (ins rGPR:$Rfalse, rGPR:$Rn, t2_so_reg:$ShiftedRm,
-                                pred:$p, cc_out:$s),
-                           4, iis, [],
-            (irs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>,
-                           RegConstraint<"$Rfalse = $Rd">;
-} // T2I_bincc_irs
-
-defm t2ANDCC : T2I_bincc_irs<t2ANDri, t2ANDrr, t2ANDrs,
-                             IIC_iBITi, IIC_iBITr, IIC_iBITsi>;
-defm t2ORRCC : T2I_bincc_irs<t2ORRri, t2ORRrr, t2ORRrs,
-                             IIC_iBITi, IIC_iBITr, IIC_iBITsi>;
-defm t2EORCC : T2I_bincc_irs<t2EORri, t2EORrr, t2EORrs,
-                             IIC_iBITi, IIC_iBITr, IIC_iBITsi>;
 } // neverHasSideEffects
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
index a812e21..2dc49a9 100644
--- a/lib/Target/ARM/ARMJITInfo.cpp
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -169,7 +169,7 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
       intptr_t LazyPtr = getIndirectSymAddr(Fn);
       if (!LazyPtr) {
         // In PIC mode, the function stub is loading a lazy-ptr.
-        LazyPtr= (intptr_t)emitGlobalValueIndirectSym((GlobalValue*)F, Fn, JCE);
+        LazyPtr= (intptr_t)emitGlobalValueIndirectSym((const GlobalValue*)F, Fn, JCE);
         DEBUG(if (F)
                 errs() << "JIT: Indirect symbol emitted at [" << LazyPtr
                        << "] for GV '" << F->getName() << "'\n";
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 3a5957b..e1e2f6e 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -181,49 +181,44 @@ class ARMAsmParser : public MCTargetAsmParser {
   OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index);
 
   // Asm Match Converter Methods
-  bool cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
-                    const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
-                    const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+  void cvtT2LdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtT2StrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+  void cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+  void cvtLdWriteBackRegAddrMode2(MCInst &Inst,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+  void cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+  void cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+  void cvtStWriteBackRegAddrMode2(MCInst &Inst,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+  void cvtStWriteBackRegAddrMode3(MCInst &Inst,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+  void cvtLdExtTWriteBackImm(MCInst &Inst,
                              const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+  void cvtLdExtTWriteBackReg(MCInst &Inst,
                              const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+  void cvtStExtTWriteBackImm(MCInst &Inst,
                              const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+  void cvtStExtTWriteBackReg(MCInst &Inst,
                              const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtLdrdPre(MCInst &Inst, unsigned Opcode,
-                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtStrdPre(MCInst &Inst, unsigned Opcode,
-                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+  void cvtLdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtStrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtLdWriteBackRegAddrMode3(MCInst &Inst,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
+  void cvtThumbMultiply(MCInst &Inst,
                         const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtVLDwbFixed(MCInst &Inst, unsigned Opcode,
+  void cvtVLDwbFixed(MCInst &Inst,
                      const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtVLDwbRegister(MCInst &Inst, unsigned Opcode,
+  void cvtVLDwbRegister(MCInst &Inst,
                         const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtVSTwbFixed(MCInst &Inst, unsigned Opcode,
+  void cvtVSTwbFixed(MCInst &Inst,
                      const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool cvtVSTwbRegister(MCInst &Inst, unsigned Opcode,
+  void cvtVSTwbRegister(MCInst &Inst,
                         const SmallVectorImpl<MCParsedAsmOperand*> &);
-
   bool validateInstruction(MCInst &Inst,
                            const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
   bool processInstruction(MCInst &Inst,
@@ -267,6 +262,12 @@ public:
   bool MatchAndEmitInstruction(SMLoc IDLoc,
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                                MCStreamer &Out);
+
+  unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst,
+                           const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               unsigned OperandNum, unsigned &NumMCOperands) {
+    return getMCInstOperandNumImpl(Kind, Inst, Operands, OperandNum, NumMCOperands);
+  }
 };
 } // end anonymous namespace
 
@@ -3880,8 +3881,8 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
 /// cvtT2LdrdPre - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtT2LdrdPre(MCInst &Inst,
              const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Rt, Rt2
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -3892,14 +3893,13 @@ cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtT2StrdPre - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtT2StrdPre(MCInst &Inst,
              const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateReg(0));
@@ -3910,14 +3910,13 @@ cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtLdWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
 
@@ -3926,28 +3925,26 @@ cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
 
   ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtStWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
   ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegAddrMode2(MCInst &Inst,
                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
 
@@ -3956,14 +3953,13 @@ cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
 
   ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtLdWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
 
@@ -3972,57 +3968,53 @@ cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
 
   ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 
 /// cvtStWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
   ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegAddrMode2(MCInst &Inst,
                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
   ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegAddrMode3(MCInst &Inst,
                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
   ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtLdExtTWriteBackImm - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdExtTWriteBackImm(MCInst &Inst,
                       const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Rt
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -4034,14 +4026,13 @@ cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtLdExtTWriteBackReg - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdExtTWriteBackReg(MCInst &Inst,
                       const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Rt
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -4053,14 +4044,13 @@ cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtStExtTWriteBackImm - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStExtTWriteBackImm(MCInst &Inst,
                       const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
@@ -4072,14 +4062,13 @@ cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtStExtTWriteBackReg - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStExtTWriteBackReg(MCInst &Inst,
                       const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
@@ -4091,14 +4080,13 @@ cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtLdrdPre - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdrdPre(MCInst &Inst,
            const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Rt, Rt2
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -4109,14 +4097,13 @@ cvtLdrdPre(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtStrdPre - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStrdPre(MCInst &Inst,
            const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
@@ -4127,40 +4114,27 @@ cvtStrdPre(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// cvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegAddrMode3(MCInst &Inst,
                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
   ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
-/// cvtThumbMultiple- Convert parsed operands to MCInst.
+/// cvtThumbMultiply - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtThumbMultiply(MCInst &Inst,
            const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // The second source operand must be the same register as the destination
-  // operand.
-  if (Operands.size() == 6 &&
-      (((ARMOperand*)Operands[3])->getReg() !=
-       ((ARMOperand*)Operands[5])->getReg()) &&
-      (((ARMOperand*)Operands[3])->getReg() !=
-       ((ARMOperand*)Operands[4])->getReg())) {
-    Error(Operands[3]->getStartLoc(),
-          "destination register must match source register");
-    return false;
-  }
   ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
   ((ARMOperand*)Operands[1])->addCCOutOperands(Inst, 1);
   // If we have a three-operand form, make sure to set Rn to be the operand
@@ -4173,12 +4147,10 @@ cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[RegOp])->addRegOperands(Inst, 1);
   Inst.addOperand(Inst.getOperand(0));
   ((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2);
-
-  return true;
 }
 
-bool ARMAsmParser::
-cvtVLDwbFixed(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVLDwbFixed(MCInst &Inst,
               const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Vd
   ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
@@ -4188,11 +4160,10 @@ cvtVLDwbFixed(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
-bool ARMAsmParser::
-cvtVLDwbRegister(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVLDwbRegister(MCInst &Inst,
                  const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Vd
   ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
@@ -4204,11 +4175,10 @@ cvtVLDwbRegister(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
-bool ARMAsmParser::
-cvtVSTwbFixed(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVSTwbFixed(MCInst &Inst,
               const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
@@ -4218,11 +4188,10 @@ cvtVSTwbFixed(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
-bool ARMAsmParser::
-cvtVSTwbRegister(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVSTwbRegister(MCInst &Inst,
                  const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
@@ -4234,7 +4203,6 @@ cvtVSTwbRegister(MCInst &Inst, unsigned Opcode,
   ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
   // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-  return true;
 }
 
 /// Parse an ARM memory expression, return false if successful else return true
@@ -5377,6 +5345,25 @@ validateInstruction(MCInst &Inst,
                    "in register list");
     break;
   }
+  case ARM::tMUL: {
+    // The second source operand must be the same register as the destination
+    // operand.
+    //
+    // In this case, we must directly check the parsed operands because the
+    // cvtThumbMultiply() function is written in such a way that it guarantees
+    // this first statement is always true for the new Inst.  Essentially, the
+    // destination is unconditionally copied into the second source operand
+    // without checking to see if it matches what we actually parsed.
+    if (Operands.size() == 6 &&
+        (((ARMOperand*)Operands[3])->getReg() !=
+         ((ARMOperand*)Operands[5])->getReg()) &&
+        (((ARMOperand*)Operands[3])->getReg() !=
+         ((ARMOperand*)Operands[4])->getReg())) {
+      return Error(Operands[3]->getStartLoc(),
+                   "destination register must match source register");
+    }
+    break;
+  }
   // Like for ldm/stm, push and pop have hi-reg handling version in Thumb2,
   // so only issue a diagnostic for thumb1. The instructions will be
   // switched to the t2 encodings in processInstruction() if necessary.
@@ -7475,9 +7462,11 @@ MatchAndEmitInstruction(SMLoc IDLoc,
                         SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                         MCStreamer &Out) {
   MCInst Inst;
+  unsigned Kind;
   unsigned ErrorInfo;
   unsigned MatchResult;
-  MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo);
+
+  MatchResult = MatchInstructionImpl(Operands, Kind, Inst, ErrorInfo);
   switch (MatchResult) {
   default: break;
   case Match_Success:
@@ -7540,9 +7529,6 @@ MatchAndEmitInstruction(SMLoc IDLoc,
   case Match_MnemonicFail:
     return Error(IDLoc, "invalid instruction",
                  ((ARMOperand*)Operands[0])->getLocRange());
-  case Match_ConversionFail:
-    // The converter function will have already emitted a diagnostic.
-    return true;
   case Match_RequiresNotITBlock:
     return Error(IDLoc, "flag setting instruction only valid outside IT block");
   case Match_RequiresITBlock:
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index c90751d..57642e1 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -2701,6 +2701,8 @@ static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn,
   unsigned align = fieldFromInstruction(Insn, 4, 1);
   unsigned size = fieldFromInstruction(Insn, 6, 2);
 
+  if (size == 0 && align == 1)
+    return MCDisassembler::Fail;
   align *= (1 << size);
 
   switch (Inst.getOpcode()) {
@@ -2831,6 +2833,8 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn,
   unsigned align = fieldFromInstruction(Insn, 4, 1);
 
   if (size == 0x3) {
+    if (align == 0)
+      return MCDisassembler::Fail;
     size = 4;
     align = 16;
   } else {
@@ -3170,7 +3174,7 @@ static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
     int imm = Val & 0xFF;
 
     if (!(Val & 0x100)) imm *= -1;
-    Inst.addOperand(MCOperand::CreateImm(imm << 2));
+    Inst.addOperand(MCOperand::CreateImm(imm * 4));
   }
 
   return MCDisassembler::Success;
@@ -3710,8 +3714,16 @@ static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
       if (fieldFromInstruction(Insn, 6, 1))
         return MCDisassembler::Fail; // UNDEFINED
       index = fieldFromInstruction(Insn, 7, 1);
-      if (fieldFromInstruction(Insn, 4, 2) != 0)
-        align = 4;
+
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0 :
+          align = 0; break;
+        case 3:
+          align = 4; break;
+        default:
+          return MCDisassembler::Fail;
+      }
+      break;
   }
 
   if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
@@ -3769,8 +3781,16 @@ static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
       if (fieldFromInstruction(Insn, 6, 1))
         return MCDisassembler::Fail; // UNDEFINED
       index = fieldFromInstruction(Insn, 7, 1);
-      if (fieldFromInstruction(Insn, 4, 2) != 0)
-        align = 4;
+
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0: 
+          align = 0; break;
+        case 3:
+          align = 4; break;
+        default:
+          return MCDisassembler::Fail;
+      }
+      break;
   }
 
   if (Rm != 0xF) { // Writeback
@@ -4090,8 +4110,15 @@ static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
         inc = 2;
       break;
     case 2:
-      if (fieldFromInstruction(Insn, 4, 2))
-        align = 4 << fieldFromInstruction(Insn, 4, 2);
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0:
+          align = 0; break;
+        case 3:
+          return MCDisassembler::Fail;
+        default:
+          align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+      }
+
       index = fieldFromInstruction(Insn, 7, 1);
       if (fieldFromInstruction(Insn, 6, 1))
         inc = 2;
@@ -4164,8 +4191,15 @@ static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
         inc = 2;
       break;
     case 2:
-      if (fieldFromInstruction(Insn, 4, 2))
-        align = 4 << fieldFromInstruction(Insn, 4, 2);
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0:
+          align = 0; break;
+        case 3:
+          return MCDisassembler::Fail;
+        default:
+          align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+      }
+
       index = fieldFromInstruction(Insn, 7, 1);
       if (fieldFromInstruction(Insn, 6, 1))
         inc = 2;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 7d6acbc..b53da3b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -194,6 +194,10 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
     case ARM::fixup_arm_uncondbranch:
       Type = ELF::R_ARM_JUMP24;
       break;
+    case ARM::fixup_t2_condbranch:
+    case ARM::fixup_t2_uncondbranch:
+      Type = ELF::R_ARM_THM_JUMP24;
+      break;
     case ARM::fixup_arm_movt_hi16:
     case ARM::fixup_arm_movt_hi16_pcrel:
       Type = ELF::R_ARM_MOVT_PREL;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index d32805e..c1aab9c 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -50,7 +50,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
   Code32Directive = ".code\t32";
 
   WeakRefDirective = "\t.weak\t";
-  LCOMMDirectiveType = LCOMM::NoAlignment;
 
   HasLEB128 = true;
   SupportsDebugInformation = true;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 94f1082..1917564 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -783,7 +783,7 @@ getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
 
   // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
   if (Imm8 < 0)
-    Imm8 = -Imm8;
+    Imm8 = -(uint32_t)Imm8;
 
   // Scaled by 4.
   Imm8 /= 4;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index a51e0fa..95640f7 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -410,7 +410,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
   if (Type == macho::RIT_ARM_Half) {
     // The other-half value only gets populated for the movt and movw
     // relocation entries.
-    uint32_t Value = 0;;
+    uint32_t Value = 0;
     switch ((unsigned)Fixup.getKind()) {
     default: break;
     case ARM::fixup_arm_movw_lo16:
diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp
index 03d5a9a..3396e8b 100644
--- a/lib/Target/CellSPU/SPUAsmPrinter.cpp
+++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp
@@ -130,8 +130,7 @@ namespace {
     void
     printS10ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O)
     {
-      short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16)
-                             >> 16);
+      short value = MI->getOperand(OpNo).getImm();
       assert((value >= -(1 << 9) && value <= (1 << 9) - 1)
              && "Invalid s10 argument");
       O << value;
@@ -140,8 +139,7 @@ namespace {
     void
     printU10ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O)
     {
-      short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16)
-                             >> 16);
+      short value = MI->getOperand(OpNo).getImm();
       assert((value <= (1 << 10) - 1) && "Invalid u10 argument");
       O << value;
     }
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
index c27caea..425371d 100644
--- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@@ -83,12 +83,10 @@ namespace {
       return true;
     } else if (vt == MVT::i32) {
       int32_t i_val = (int32_t) CN->getZExtValue();
-      short s_val = (short) i_val;
-      return i_val == s_val;
+      return i_val == SignExtend32<16>(i_val);
     } else {
       int64_t i_val = (int64_t) CN->getZExtValue();
-      short s_val = (short) i_val;
-      return i_val == s_val;
+      return i_val == SignExtend64<16>(i_val);
     }
   }
 
@@ -99,9 +97,10 @@ namespace {
     EVT vt = FPN->getValueType(0);
     if (vt == MVT::f32) {
       int val = FloatToBits(FPN->getValueAPF().convertToFloat());
-      int sval = (int) ((val << 16) >> 16);
-      Imm = (short) val;
-      return val == sval;
+      if (val == SignExtend32<16>(val)) {
+        Imm = (short) val;
+        return true;
+      }
     }
 
     return false;
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 1f2d8ac..306084b 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_target(HexagonCodeGen
   HexagonExpandPredSpillCode.cpp
   HexagonFrameLowering.cpp
   HexagonHardwareLoops.cpp
+  HexagonMachineScheduler.cpp
   HexagonMCInstLower.cpp
   HexagonInstrInfo.cpp
   HexagonISelDAGToDAG.cpp
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
new file mode 100644
index 0000000..b131a8f
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -0,0 +1,952 @@
+//===- HexagonMachineScheduler.cpp - MI Scheduler for Hexagon -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineScheduler schedules machine instructions after phi elimination. It
+// preserves LiveIntervals so it can be invoked before register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "misched"
+
+#include "HexagonMachineScheduler.h"
+
+#include <queue>
+
+using namespace llvm;
+
+static cl::opt<bool> ForceTopDown("vliw-misched-topdown", cl::Hidden,
+                                  cl::desc("Force top-down list scheduling"));
+static cl::opt<bool> ForceBottomUp("vliw-misched-bottomup", cl::Hidden,
+                                   cl::desc("Force bottom-up list scheduling"));
+
+#ifndef NDEBUG
+static cl::opt<bool> ViewMISchedDAGs("vliw-view-misched-dags", cl::Hidden,
+  cl::desc("Pop up a window to show MISched dags after they are processed"));
+
+static cl::opt<unsigned> MISchedCutoff("vliw-misched-cutoff", cl::Hidden,
+  cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
+#else
+static bool ViewMISchedDAGs = false;
+#endif // NDEBUG
+
+/// Decrement this iterator until reaching the top or a non-debug instr.
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
+  assert(I != Beg && "reached the top of the region, cannot decrement");
+  while (--I != Beg) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
+  for(; I != End; ++I) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
+/// NumPredsLeft reaches zero, release the successor node.
+///
+/// FIXME: Adjust SuccSU height based on MinLatency.
+void VLIWMachineScheduler::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+#ifndef NDEBUG
+  if (SuccSU->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    SuccSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --SuccSU->NumPredsLeft;
+  if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+    SchedImpl->releaseTopNode(SuccSU);
+}
+
+/// releaseSuccessors - Call releaseSucc on each of SU's successors.
+void VLIWMachineScheduler::releaseSuccessors(SUnit *SU) {
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    releaseSucc(SU, &*I);
+  }
+}
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
+/// NumSuccsLeft reaches zero, release the predecessor node.
+///
+/// FIXME: Adjust PredSU height based on MinLatency.
+void VLIWMachineScheduler::releasePred(SUnit *SU, SDep *PredEdge) {
+  SUnit *PredSU = PredEdge->getSUnit();
+
+#ifndef NDEBUG
+  if (PredSU->NumSuccsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    PredSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --PredSU->NumSuccsLeft;
+  if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
+    SchedImpl->releaseBottomNode(PredSU);
+}
+
+/// releasePredecessors - Call releasePred on each of SU's predecessors.
+void VLIWMachineScheduler::releasePredecessors(SUnit *SU) {
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    releasePred(SU, &*I);
+  }
+}
+
+void VLIWMachineScheduler::moveInstruction(MachineInstr *MI,
+                                    MachineBasicBlock::iterator InsertPos) {
+  // Advance RegionBegin if the first instruction moves down.
+  if (&*RegionBegin == MI)
+    ++RegionBegin;
+
+  // Update the instruction stream.
+  BB->splice(InsertPos, BB, MI);
+
+  // Update LiveIntervals
+  LIS->handleMove(MI);
+
+  // Recede RegionBegin if an instruction moves above the first.
+  if (RegionBegin == InsertPos)
+    RegionBegin = MI;
+}
+
+bool VLIWMachineScheduler::checkSchedLimit() {
+#ifndef NDEBUG
+  if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
+    CurrentTop = CurrentBottom;
+    return false;
+  }
+  ++NumInstrsScheduled;
+#endif
+  return true;
+}
+
+/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void VLIWMachineScheduler::enterRegion(MachineBasicBlock *bb,
+                                MachineBasicBlock::iterator begin,
+                                MachineBasicBlock::iterator end,
+                                unsigned endcount)
+{
+  ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+
+  // For convenience remember the end of the liveness region.
+  LiveRegionEnd =
+    (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
+}
+
+// Setup the register pressure trackers for the top scheduled top and bottom
+// scheduled regions.
+void VLIWMachineScheduler::initRegPressure() {
+  TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+  BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+  // Close the RPTracker to finalize live ins.
+  RPTracker.closeRegion();
+
+  DEBUG(RPTracker.getPressure().dump(TRI));
+
+  // Initialize the live ins and live outs.
+  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+  // Close one end of the tracker so we can call
+  // getMaxUpward/DownwardPressureDelta before advancing across any
+  // instructions. This converts currently live regs into live ins/outs.
+  TopRPTracker.closeTop();
+  BotRPTracker.closeBottom();
+
+  // Account for liveness generated by the region boundary.
+  if (LiveRegionEnd != RegionEnd)
+    BotRPTracker.recede();
+
+  assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
+
+  // Cache the list of excess pressure sets in this region. This will also track
+  // the max pressure in the scheduled code for these sets.
+  RegionCriticalPSets.clear();
+  std::vector<unsigned> RegionPressure = RPTracker.getPressure().MaxSetPressure;
+  for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
+    unsigned Limit = TRI->getRegPressureSetLimit(i);
+    DEBUG(dbgs() << TRI->getRegPressureSetName(i)
+          << "Limit " << Limit
+          << " Actual " << RegionPressure[i] << "\n");
+    if (RegionPressure[i] > Limit)
+      RegionCriticalPSets.push_back(PressureElement(i, 0));
+  }
+  DEBUG(dbgs() << "Excess PSets: ";
+        for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
+          dbgs() << TRI->getRegPressureSetName(
+            RegionCriticalPSets[i].PSetID) << " ";
+        dbgs() << "\n");
+
+  TotalPackets = 0;
+}
+
+// FIXME: When the pressure tracker deals in pressure differences then we won't
+// iterate over all RegionCriticalPSets[i].
+void VLIWMachineScheduler::
+updateScheduledPressure(std::vector<unsigned> NewMaxPressure) {
+  for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
+    unsigned ID = RegionCriticalPSets[i].PSetID;
+    int &MaxUnits = RegionCriticalPSets[i].UnitIncrease;
+    if ((int)NewMaxPressure[ID] > MaxUnits)
+      MaxUnits = NewMaxPressure[ID];
+  }
+}
+
+/// Check if scheduling of this SU is possible
+/// in the current packet.
+/// It is _not_ precise (statefull), it is more like
+/// another heuristic. Many corner cases are figured
+/// empirically.
+bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
+  if (!SU || !SU->getInstr())
+    return false;
+
+  // First see if the pipeline could receive this instruction
+  // in the current cycle.
+  switch (SU->getInstr()->getOpcode()) {
+  default:
+    if (!ResourcesModel->canReserveResources(SU->getInstr()))
+      return false;
+  case TargetOpcode::EXTRACT_SUBREG:
+  case TargetOpcode::INSERT_SUBREG:
+  case TargetOpcode::SUBREG_TO_REG:
+  case TargetOpcode::REG_SEQUENCE:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::COPY:
+  case TargetOpcode::INLINEASM:
+    break;
+  }
+
+  // Now see if there are no other dependencies to instructions already
+  // in the packet.
+  for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+    if (Packet[i]->Succs.size() == 0)
+      continue;
+    for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
+         E = Packet[i]->Succs.end(); I != E; ++I) {
+      // Since we do not add pseudos to packets, might as well
+      // ignore order dependencies.
+      if (I->isCtrl())
+        continue;
+
+      if (I->getSUnit() == SU)
+        return false;
+    }
+  }
+  return true;
+}
+
+/// Keep track of available resources.
+bool VLIWResourceModel::reserveResources(SUnit *SU) {
+  bool startNewCycle = false;
+  // If this SU does not fit in the packet
+  // start a new one.
+  if (!isResourceAvailable(SU)) {
+    ResourcesModel->clearResources();
+    Packet.clear();
+    TotalPackets++;
+    startNewCycle = true;
+  }
+
+  switch (SU->getInstr()->getOpcode()) {
+  default:
+    ResourcesModel->reserveResources(SU->getInstr());
+    break;
+  case TargetOpcode::EXTRACT_SUBREG:
+  case TargetOpcode::INSERT_SUBREG:
+  case TargetOpcode::SUBREG_TO_REG:
+  case TargetOpcode::REG_SEQUENCE:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::PROLOG_LABEL:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::COPY:
+  case TargetOpcode::INLINEASM:
+    break;
+  }
+  Packet.push_back(SU);
+
+#ifndef NDEBUG
+  DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
+  for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+    DEBUG(dbgs() << "\t[" << i << "] SU(");
+    DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
+    DEBUG(Packet[i]->getInstr()->dump());
+  }
+#endif
+
+  // If packet is now full, reset the state so in the next cycle
+  // we start fresh.
+  if (Packet.size() >= InstrItins->SchedModel->IssueWidth) {
+    ResourcesModel->clearResources();
+    Packet.clear();
+    TotalPackets++;
+    startNewCycle = true;
+  }
+
+  return startNewCycle;
+}
+
+// Release all DAG roots for scheduling.
+void VLIWMachineScheduler::releaseRoots() {
+  SmallVector<SUnit*, 16> BotRoots;
+
+  for (std::vector<SUnit>::iterator
+         I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
+    // A SUnit is ready to top schedule if it has no predecessors.
+    if (I->Preds.empty())
+      SchedImpl->releaseTopNode(&(*I));
+    // A SUnit is ready to bottom schedule if it has no successors.
+    if (I->Succs.empty())
+      BotRoots.push_back(&(*I));
+  }
+  // Release bottom roots in reverse order so the higher priority nodes appear
+  // first. This is more natural and slightly more efficient.
+  for (SmallVectorImpl<SUnit*>::const_reverse_iterator
+         I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I)
+    SchedImpl->releaseBottomNode(*I);
+}
+
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+void VLIWMachineScheduler::schedule() {
+  DEBUG(dbgs()
+        << "********** MI Converging Scheduling VLIW BB#" << BB->getNumber()
+        << " " << BB->getName()
+        << " in_func " << BB->getParent()->getFunction()->getName()
+        << " at loop depth "  << MLI->getLoopDepth(BB)
+        << " \n");
+
+  // Initialize the register pressure tracker used by buildSchedGraph.
+  RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+  // Account for liveness generate by the region boundary.
+  if (LiveRegionEnd != RegionEnd)
+    RPTracker.recede();
+
+  // Build the DAG, and compute current register pressure.
+  buildSchedGraph(AA, &RPTracker);
+
+  // Initialize top/bottom trackers after computing region pressure.
+  initRegPressure();
+
+  // To view Height/Depth correctly, they should be accessed at least once.
+  DEBUG(unsigned maxH = 0;
+        for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          if (SUnits[su].getHeight() > maxH)
+            maxH = SUnits[su].getHeight();
+        dbgs() << "Max Height " << maxH << "\n";);
+  DEBUG(unsigned maxD = 0;
+        for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          if (SUnits[su].getDepth() > maxD)
+            maxD = SUnits[su].getDepth();
+        dbgs() << "Max Depth " << maxD << "\n";);
+  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          SUnits[su].dumpAll(this));
+
+  if (ViewMISchedDAGs) viewGraph();
+
+  SchedImpl->initialize(this);
+
+  // Release edges from the special Entry node or to the special Exit node.
+  releaseSuccessors(&EntrySU);
+  releasePredecessors(&ExitSU);
+
+  // Release all DAG roots for scheduling.
+  releaseRoots();
+
+  CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
+  CurrentBottom = RegionEnd;
+  bool IsTopNode = false;
+  while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
+    if (!checkSchedLimit())
+      break;
+
+    // Move the instruction to its new location in the instruction stream.
+    MachineInstr *MI = SU->getInstr();
+
+    if (IsTopNode) {
+      assert(SU->isTopReady() && "node still has unscheduled dependencies");
+      if (&*CurrentTop == MI)
+        CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+      else {
+        moveInstruction(MI, CurrentTop);
+        TopRPTracker.setPos(MI);
+      }
+
+      // Update top scheduled pressure.
+      TopRPTracker.advance();
+      assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+      updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
+
+      // Release dependent instructions for scheduling.
+      releaseSuccessors(SU);
+    } else {
+      assert(SU->isBottomReady() && "node still has unscheduled dependencies");
+      MachineBasicBlock::iterator priorII =
+        priorNonDebug(CurrentBottom, CurrentTop);
+      if (&*priorII == MI)
+        CurrentBottom = priorII;
+      else {
+        if (&*CurrentTop == MI) {
+          CurrentTop = nextIfDebug(++CurrentTop, priorII);
+          TopRPTracker.setPos(CurrentTop);
+        }
+        moveInstruction(MI, CurrentBottom);
+        CurrentBottom = MI;
+      }
+      // Update bottom scheduled pressure.
+      BotRPTracker.recede();
+      assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+      updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
+
+      // Release dependent instructions for scheduling.
+      releasePredecessors(SU);
+    }
+    SU->isScheduled = true;
+    SchedImpl->schedNode(SU, IsTopNode);
+  }
+  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+  placeDebugValues();
+}
+
+/// Reinsert any remaining debug_values, just like the PostRA scheduler.
+void VLIWMachineScheduler::placeDebugValues() {
+  // If first instruction was a DBG_VALUE then put it back.
+  if (FirstDbgValue) {
+    BB->splice(RegionBegin, BB, FirstDbgValue);
+    RegionBegin = FirstDbgValue;
+  }
+
+  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+         DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+    std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
+    MachineInstr *DbgValue = P.first;
+    MachineBasicBlock::iterator OrigPrevMI = P.second;
+    BB->splice(++OrigPrevMI, BB, DbgValue);
+    if (OrigPrevMI == llvm::prior(RegionEnd))
+      RegionEnd = DbgValue;
+  }
+  DbgValues.clear();
+  FirstDbgValue = NULL;
+}
+
+void ConvergingVLIWScheduler::initialize(VLIWMachineScheduler *dag) {
+  DAG = dag;
+  TRI = DAG->TRI;
+  Top.DAG = dag;
+  Bot.DAG = dag;
+
+  // Initialize the HazardRecognizers.
+  const TargetMachine &TM = DAG->MF.getTarget();
+  const InstrItineraryData *Itin = TM.getInstrItineraryData();
+  Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+  Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+  Top.ResourceModel = new VLIWResourceModel(TM);
+  Bot.ResourceModel = new VLIWResourceModel(TM);
+
+  assert((!ForceTopDown || !ForceBottomUp) &&
+         "-misched-topdown incompatible with -misched-bottomup");
+}
+
+void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
+    unsigned MinLatency = I->getMinLatency();
+#ifndef NDEBUG
+    Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
+#endif
+    if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
+      SU->TopReadyCycle = PredReadyCycle + MinLatency;
+  }
+  Top.releaseNode(SU, SU->TopReadyCycle);
+}
+
+void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+    unsigned MinLatency = I->getMinLatency();
+#ifndef NDEBUG
+    Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
+#endif
+    if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
+      SU->BotReadyCycle = SuccReadyCycle + MinLatency;
+  }
+  Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingVLIWScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+  if (HazardRec->isEnabled())
+    return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+  if (IssueCount + DAG->getNumMicroOps(SU->getInstr()) > DAG->getIssueWidth())
+    return true;
+
+  return false;
+}
+
+void ConvergingVLIWScheduler::SchedBoundary::releaseNode(SUnit *SU,
+                                                     unsigned ReadyCycle) {
+  if (ReadyCycle < MinReadyCycle)
+    MinReadyCycle = ReadyCycle;
+
+  // Check for interlocks first. For the purpose of other heuristics, an
+  // instruction that cannot issue appears as if it's not in the ReadyQueue.
+  if (ReadyCycle > CurrCycle || checkHazard(SU))
+
+    Pending.push(SU);
+  else
+    Available.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingVLIWScheduler::SchedBoundary::bumpCycle() {
+  unsigned Width = DAG->getIssueWidth();
+  IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+
+  assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+  unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+
+  if (!HazardRec->isEnabled()) {
+    // Bypass HazardRec virtual calls.
+    CurrCycle = NextCycle;
+  } else {
+    // Bypass getHazardType calls in case of long latency.
+    for (; CurrCycle != NextCycle; ++CurrCycle) {
+      if (isTop())
+        HazardRec->AdvanceCycle();
+      else
+        HazardRec->RecedeCycle();
+    }
+  }
+  CheckPending = true;
+
+  DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
+        << CurrCycle << '\n');
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingVLIWScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+  bool startNewCycle = false;
+
+  // Update the reservation table.
+  if (HazardRec->isEnabled()) {
+    if (!isTop() && SU->isCall) {
+      // Calls are scheduled with their preceding instructions. For bottom-up
+      // scheduling, clear the pipeline state before emitting.
+      HazardRec->Reset();
+    }
+    HazardRec->EmitInstruction(SU);
+  }
+
+  // Update DFA model.
+  startNewCycle = ResourceModel->reserveResources(SU);
+
+  // Check the instruction group dispatch limit.
+  // TODO: Check if this SU must end a dispatch group.
+  IssueCount += DAG->getNumMicroOps(SU->getInstr());
+  if (startNewCycle) {
+    DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+    bumpCycle();
+  }
+  else
+    DEBUG(dbgs() << "*** IssueCount " << IssueCount
+          << " at cycle " << CurrCycle << '\n');
+}
+
+/// Release pending ready nodes in to the available queue. This makes them
+/// visible to heuristics.
+void ConvergingVLIWScheduler::SchedBoundary::releasePending() {
+  // If the available queue is empty, it is safe to reset MinReadyCycle.
+  if (Available.empty())
+    MinReadyCycle = UINT_MAX;
+
+  // Check to see if any of the pending instructions are ready to issue.  If
+  // so, add them to the available queue.
+  for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+    SUnit *SU = *(Pending.begin()+i);
+    unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+    if (ReadyCycle < MinReadyCycle)
+      MinReadyCycle = ReadyCycle;
+
+    if (ReadyCycle > CurrCycle)
+      continue;
+
+    if (checkHazard(SU))
+      continue;
+
+    Available.push(SU);
+    Pending.remove(Pending.begin()+i);
+    --i; --e;
+  }
+  CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void ConvergingVLIWScheduler::SchedBoundary::removeReady(SUnit *SU) {
+  if (Available.isInQueue(SU))
+    Available.remove(Available.find(SU));
+  else {
+    assert(Pending.isInQueue(SU) && "bad ready count");
+    Pending.remove(Pending.find(SU));
+  }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// advance the cycle until at least one node is ready. If multiple instructions
+/// are ready, return NULL.
+SUnit *ConvergingVLIWScheduler::SchedBoundary::pickOnlyChoice() {
+  if (CheckPending)
+    releasePending();
+
+  for (unsigned i = 0; Available.empty(); ++i) {
+    assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+           "permanent hazard"); (void)i;
+    bumpCycle();
+    releasePending();
+  }
+  if (Available.size() == 1)
+    return *Available.begin();
+  return NULL;
+}
+
+#ifndef NDEBUG
+void ConvergingVLIWScheduler::traceCandidate(const char *Label,
+                                             const ReadyQueue &Q,
+                                             SUnit *SU, PressureElement P) {
+  dbgs() << Label << " " << Q.getName() << " ";
+  if (P.isValid())
+    dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
+           << " ";
+  else
+    dbgs() << "     ";
+  SU->dump(DAG);
+}
+#endif
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+static SUnit *getSingleUnscheduledPred(SUnit *SU) {
+  SUnit *OnlyAvailablePred = 0;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    SUnit &Pred = *I->getSUnit();
+    if (!Pred.isScheduled) {
+      // We found an available, but not scheduled, predecessor.  If it's the
+      // only one we have found, keep track of it... otherwise give up.
+      if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+        return 0;
+      OnlyAvailablePred = &Pred;
+    }
+  }
+  return OnlyAvailablePred;
+}
+
+/// getSingleUnscheduledSucc - If there is exactly one unscheduled successor
+/// of SU, return it, otherwise return null.
+static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
+  SUnit *OnlyAvailableSucc = 0;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    SUnit &Succ = *I->getSUnit();
+    if (!Succ.isScheduled) {
+      // We found an available, but not scheduled, successor.  If it's the
+      // only one we have found, keep track of it... otherwise give up.
+      if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ)
+        return 0;
+      OnlyAvailableSucc = &Succ;
+    }
+  }
+  return OnlyAvailableSucc;
+}
+
+// Constants used to denote relative importance of
+// heuristic components for cost computation.
+static const unsigned PriorityOne = 200;
+static const unsigned PriorityTwo = 100;
+static const unsigned PriorityThree = 50;
+static const unsigned PriorityFour = 20;
+static const unsigned ScaleTwo = 10;
+static const unsigned FactorOne = 2;
+
+/// Single point to compute overall scheduling cost.
+/// TODO: More heuristics will be used soon.
+int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
+                                            SchedCandidate &Candidate,
+                                            RegPressureDelta &Delta,
+                                            bool verbose) {
+  // Initial trivial priority.
+  int ResCount = 1;
+
+  // Do not waste time on a node that is already scheduled.
+  if (!SU || SU->isScheduled)
+    return ResCount;
+
+  // Forced priority is high.
+  if (SU->isScheduleHigh)
+    ResCount += PriorityOne;
+
+  // Critical path first.
+  if (Q.getID() == TopQID) {
+    ResCount += (SU->getHeight() * ScaleTwo);
+
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (Top.ResourceModel->isResourceAvailable(SU))
+      ResCount <<= FactorOne;
+  } else {
+    ResCount += (SU->getDepth() * ScaleTwo);
+
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (Bot.ResourceModel->isResourceAvailable(SU))
+      ResCount <<= FactorOne;
+  }
+
+  unsigned NumNodesBlocking = 0;
+  if (Q.getID() == TopQID) {
+    // How many SUs does it block from scheduling?
+    // Look at all of the successors of this node.
+    // Count the number of nodes that
+    // this node is the sole unscheduled node for.
+    for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+         I != E; ++I)
+      if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+        ++NumNodesBlocking;
+  } else {
+    // How many unscheduled predecessors block this node?
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+         I != E; ++I)
+      if (getSingleUnscheduledSucc(I->getSUnit()) == SU)
+        ++NumNodesBlocking;
+  }
+  ResCount += (NumNodesBlocking * ScaleTwo);
+
+  // Factor in reg pressure as a heuristic.
+  ResCount -= (Delta.Excess.UnitIncrease*PriorityThree);
+  ResCount -= (Delta.CriticalMax.UnitIncrease*PriorityThree);
+
+  DEBUG(if (verbose) dbgs() << " Total(" << ResCount << ")");
+
+  return ResCount;
+}
+
+/// Pick the best candidate from the top queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler::
+pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+                  SchedCandidate &Candidate) {
+  DEBUG(Q.dump());
+
+  // getMaxPressureDelta temporarily modifies the tracker.
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  // BestSU remains NULL if no top candidates beat the best existing candidate.
+  CandResult FoundCandidate = NoCand;
+  for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+    RegPressureDelta RPDelta;
+    TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+                                    DAG->getRegionCriticalPSets(),
+                                    DAG->getRegPressure().MaxSetPressure);
+
+    int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false);
+
+    // Initialize the candidate if needed.
+    if (!Candidate.SU) {
+      Candidate.SU = *I;
+      Candidate.RPDelta = RPDelta;
+      Candidate.SCost = CurrentCost;
+      FoundCandidate = NodeOrder;
+      continue;
+    }
+
+    // Best cost.
+    if (CurrentCost > Candidate.SCost) {
+      DEBUG(traceCandidate("CCAND", Q, *I));
+      Candidate.SU = *I;
+      Candidate.RPDelta = RPDelta;
+      Candidate.SCost = CurrentCost;
+      FoundCandidate = BestCost;
+      continue;
+    }
+
+    // Fall through to original instruction order.
+    // Only consider node order if Candidate was chosen from this Q.
+    if (FoundCandidate == NoCand)
+      continue;
+  }
+  return FoundCandidate;
+}
+
+/// Pick the best candidate node from either the top or bottom queue.
+SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
+  // Schedule as far as possible in the direction of no choice. This is most
+  // efficient, but also provides the best heuristics for CriticalPSets.
+  if (SUnit *SU = Bot.pickOnlyChoice()) {
+    IsTopNode = false;
+    return SU;
+  }
+  if (SUnit *SU = Top.pickOnlyChoice()) {
+    IsTopNode = true;
+    return SU;
+  }
+  SchedCandidate BotCand;
+  // Prefer bottom scheduling when heuristics are silent.
+  CandResult BotResult = pickNodeFromQueue(Bot.Available,
+                                           DAG->getBotRPTracker(), BotCand);
+  assert(BotResult != NoCand && "failed to find the first candidate");
+
+  // If either Q has a single candidate that provides the least increase in
+  // Excess pressure, we can immediately schedule from that Q.
+  //
+  // RegionCriticalPSets summarizes the pressure within the scheduled region and
+  // affects picking from either Q. If scheduling in one direction must
+  // increase pressure for one of the excess PSets, then schedule in that
+  // direction first to provide more freedom in the other direction.
+  if (BotResult == SingleExcess || BotResult == SingleCritical) {
+    IsTopNode = false;
+    return BotCand.SU;
+  }
+  // Check if the top Q has a better candidate.
+  SchedCandidate TopCand;
+  CandResult TopResult = pickNodeFromQueue(Top.Available,
+                                           DAG->getTopRPTracker(), TopCand);
+  assert(TopResult != NoCand && "failed to find the first candidate");
+
+  if (TopResult == SingleExcess || TopResult == SingleCritical) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // If either Q has a single candidate that minimizes pressure above the
+  // original region's pressure pick it.
+  if (BotResult == SingleMax) {
+    IsTopNode = false;
+    return BotCand.SU;
+  }
+  if (TopResult == SingleMax) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  if (TopCand.SCost > BotCand.SCost) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // Otherwise prefer the bottom candidate in node order.
+  IsTopNode = false;
+  return BotCand.SU;
+}
+
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return NULL;
+  }
+  SUnit *SU;
+  if (ForceTopDown) {
+    SU = Top.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate TopCand;
+      CandResult TopResult =
+        pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+      assert(TopResult != NoCand && "failed to find the first candidate");
+      (void)TopResult;
+      SU = TopCand.SU;
+    }
+    IsTopNode = true;
+  } else if (ForceBottomUp) {
+    SU = Bot.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate BotCand;
+      CandResult BotResult =
+        pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+      assert(BotResult != NoCand && "failed to find the first candidate");
+      (void)BotResult;
+      SU = BotCand.SU;
+    }
+    IsTopNode = false;
+  } else {
+    SU = pickNodeBidrectional(IsTopNode);
+  }
+  if (SU->isTopReady())
+    Top.removeReady(SU);
+  if (SU->isBottomReady())
+    Bot.removeReady(SU);
+
+  DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+        << " Scheduling Instruction in cycle "
+        << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
+        SU->dump(DAG));
+  return SU;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, VLIWMachineScheduler needs
+/// to update it's state based on the current cycle before MachineSchedStrategy
+/// does.
+void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+  if (IsTopNode) {
+    SU->TopReadyCycle = Top.CurrCycle;
+    Top.bumpNode(SU);
+  } else {
+    SU->BotReadyCycle = Bot.CurrCycle;
+    Bot.bumpNode(SU);
+  }
+}
+
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
new file mode 100644
index 0000000..f3643d6
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -0,0 +1,437 @@
+//===-- HexagonMachineScheduler.h - Custom Hexagon MI scheduler.      ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom Hexagon MI scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONASMPRINTER_H
+#define HEXAGONASMPRINTER_H
+
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/PriorityQueue.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineSchedStrategy - Interface to a machine scheduling algorithm.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class VLIWMachineScheduler;
+
+/// MachineSchedStrategy - Interface used by VLIWMachineScheduler to drive
+/// the selected scheduling algorithm.
+///
+/// TODO: Move this to ScheduleDAGInstrs.h
+class MachineSchedStrategy {
+public:
+  virtual ~MachineSchedStrategy() {}
+
+  /// Initialize the strategy after building the DAG for a new region.
+  virtual void initialize(VLIWMachineScheduler *DAG) = 0;
+
+  /// Pick the next node to schedule, or return NULL. Set IsTopNode to true to
+  /// schedule the node at the top of the unscheduled region. Otherwise it will
+  /// be scheduled at the bottom.
+  virtual SUnit *pickNode(bool &IsTopNode) = 0;
+
+  /// Notify MachineSchedStrategy that VLIWMachineScheduler has
+  /// scheduled a node.
+  virtual void schedNode(SUnit *SU, bool IsTopNode) = 0;
+
+  /// When all predecessor dependencies have been resolved, free this node for
+  /// top-down scheduling.
+  virtual void releaseTopNode(SUnit *SU) = 0;
+  /// When all successor dependencies have been resolved, free this node for
+  /// bottom-up scheduling.
+  virtual void releaseBottomNode(SUnit *SU) = 0;
+};
+
+//===----------------------------------------------------------------------===//
+// ConvergingVLIWScheduler - Implementation of the standard
+// MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+/// ReadyQueue encapsulates vector of "ready" SUnits with basic convenience
+/// methods for pushing and removing nodes. ReadyQueue's are uniquely identified
+/// by an ID. SUnit::NodeQueueId is a mask of the ReadyQueues the SUnit is in.
+class ReadyQueue {
+  unsigned ID;
+  std::string Name;
+  std::vector<SUnit*> Queue;
+
+public:
+  ReadyQueue(unsigned id, const Twine &name): ID(id), Name(name.str()) {}
+
+  unsigned getID() const { return ID; }
+
+  StringRef getName() const { return Name; }
+
+  // SU is in this queue if it's NodeQueueID is a superset of this ID.
+  bool isInQueue(SUnit *SU) const { return (SU->NodeQueueId & ID); }
+
+  bool empty() const { return Queue.empty(); }
+
+  unsigned size() const { return Queue.size(); }
+
+  typedef std::vector<SUnit*>::iterator iterator;
+
+  iterator begin() { return Queue.begin(); }
+
+  iterator end() { return Queue.end(); }
+
+  iterator find(SUnit *SU) {
+    return std::find(Queue.begin(), Queue.end(), SU);
+  }
+
+  void push(SUnit *SU) {
+    Queue.push_back(SU);
+    SU->NodeQueueId |= ID;
+  }
+
+  void remove(iterator I) {
+    (*I)->NodeQueueId &= ~ID;
+    *I = Queue.back();
+    Queue.pop_back();
+  }
+
+  void dump() {
+    dbgs() << Name << ": ";
+    for (unsigned i = 0, e = Queue.size(); i < e; ++i)
+      dbgs() << Queue[i]->NodeNum << " ";
+    dbgs() << "\n";
+  }
+};
+
+class VLIWResourceModel {
+  /// ResourcesModel - Represents VLIW state.
+  /// Not limited to VLIW targets per say, but assumes
+  /// definition of DFA by a target.
+  DFAPacketizer *ResourcesModel;
+
+  const InstrItineraryData *InstrItins;
+
+  /// Local packet/bundle model. Purely
+  /// internal to the MI schedulre at the time.
+  std::vector<SUnit*> Packet;
+
+  /// Total packets created.
+  unsigned TotalPackets;
+
+public:
+  VLIWResourceModel(MachineSchedContext *C, const InstrItineraryData *IID) :
+    InstrItins(IID), TotalPackets(0) {
+    const TargetMachine &TM = C->MF->getTarget();
+    ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM,NULL);
+
+    // This hard requirement could be relaxed,
+    // but for now do not let it proceed.
+    assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+    Packet.resize(InstrItins->SchedModel->IssueWidth);
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  VLIWResourceModel(const TargetMachine &TM) :
+    InstrItins(TM.getInstrItineraryData()), TotalPackets(0) {
+    ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM,NULL);
+
+    // This hard requirement could be relaxed,
+    // but for now do not let it proceed.
+    assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+    Packet.resize(InstrItins->SchedModel->IssueWidth);
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  ~VLIWResourceModel() {
+    delete ResourcesModel;
+  }
+
+  void resetPacketState() {
+    Packet.clear();
+  }
+
+  void resetDFA() {
+    ResourcesModel->clearResources();
+  }
+
+  void reset() {
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  bool isResourceAvailable(SUnit *SU);
+  bool reserveResources(SUnit *SU);
+  unsigned getTotalPackets() const { return TotalPackets; }
+};
+
+class VLIWMachineScheduler : public ScheduleDAGInstrs {
+  /// AA - AliasAnalysis for making memory reference queries.
+  AliasAnalysis *AA;
+
+  RegisterClassInfo *RegClassInfo;
+  MachineSchedStrategy *SchedImpl;
+
+  MachineBasicBlock::iterator LiveRegionEnd;
+
+  /// Register pressure in this region computed by buildSchedGraph.
+  IntervalPressure RegPressure;
+  RegPressureTracker RPTracker;
+
+  /// List of pressure sets that exceed the target's pressure limit before
+  /// scheduling, listed in increasing set ID order. Each pressure set is paired
+  /// with its max pressure in the currently scheduled regions.
+  std::vector<PressureElement> RegionCriticalPSets;
+
+  /// The top of the unscheduled zone.
+  MachineBasicBlock::iterator CurrentTop;
+  IntervalPressure TopPressure;
+  RegPressureTracker TopRPTracker;
+
+  /// The bottom of the unscheduled zone.
+  MachineBasicBlock::iterator CurrentBottom;
+  IntervalPressure BotPressure;
+  RegPressureTracker BotRPTracker;
+
+#ifndef NDEBUG
+  /// The number of instructions scheduled so far. Used to cut off the
+  /// scheduler at the point determined by misched-cutoff.
+  unsigned NumInstrsScheduled;
+#endif
+
+  /// Total packets in the region.
+  unsigned TotalPackets;
+
+  const MachineLoopInfo *MLI;
+public:
+  VLIWMachineScheduler(MachineSchedContext *C, MachineSchedStrategy *S):
+    ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS),
+    AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S),
+    RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure),
+    CurrentBottom(), BotRPTracker(BotPressure), MLI(C->MLI) {
+#ifndef NDEBUG
+    NumInstrsScheduled = 0;
+#endif
+    TotalPackets = 0;
+  }
+
+  virtual ~VLIWMachineScheduler() {
+    delete SchedImpl;
+  }
+
+  MachineBasicBlock::iterator top() const { return CurrentTop; }
+  MachineBasicBlock::iterator bottom() const { return CurrentBottom; }
+
+  /// Implement the ScheduleDAGInstrs interface for handling the next scheduling
+  /// region. This covers all instructions in a block, while schedule() may only
+  /// cover a subset.
+  void enterRegion(MachineBasicBlock *bb,
+                   MachineBasicBlock::iterator begin,
+                   MachineBasicBlock::iterator end,
+                   unsigned endcount);
+
+  /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
+  /// time to do some work.
+  void schedule();
+
+  unsigned CurCycle;
+
+  /// Get current register pressure for the top scheduled instructions.
+  const IntervalPressure &getTopPressure() const { return TopPressure; }
+  const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; }
+
+  /// Get current register pressure for the bottom scheduled instructions.
+  const IntervalPressure &getBotPressure() const { return BotPressure; }
+  const RegPressureTracker &getBotRPTracker() const { return BotRPTracker; }
+
+  /// Get register pressure for the entire scheduling region before scheduling.
+  const IntervalPressure &getRegPressure() const { return RegPressure; }
+
+  const std::vector<PressureElement> &getRegionCriticalPSets() const {
+    return RegionCriticalPSets;
+  }
+
+  /// getIssueWidth - Return the max instructions per scheduling group.
+  unsigned getIssueWidth() const {
+    return (InstrItins && InstrItins->SchedModel)
+      ? InstrItins->SchedModel->IssueWidth : 1;
+  }
+
+  /// getNumMicroOps - Return the number of issue slots required for this MI.
+  unsigned getNumMicroOps(MachineInstr *MI) const {
+    return 1;
+    //if (!InstrItins) return 1;
+    //int UOps = InstrItins->getNumMicroOps(MI->getDesc().getSchedClass());
+    //return (UOps >= 0) ? UOps : TII->getNumMicroOps(InstrItins, MI);
+  }
+
+private:
+  void scheduleNodeTopDown(SUnit *SU);
+  void listScheduleTopDown();
+
+  void initRegPressure();
+  void updateScheduledPressure(std::vector<unsigned> NewMaxPressure);
+
+  void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos);
+  bool checkSchedLimit();
+
+  void releaseRoots();
+
+  void releaseSucc(SUnit *SU, SDep *SuccEdge);
+  void releaseSuccessors(SUnit *SU);
+  void releasePred(SUnit *SU, SDep *PredEdge);
+  void releasePredecessors(SUnit *SU);
+
+  void placeDebugValues();
+};
+
+/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics
+/// to balance the schedule.
+class ConvergingVLIWScheduler : public MachineSchedStrategy {
+
+  /// Store the state used by ConvergingVLIWScheduler heuristics, required
+  ///  for the lifetime of one invocation of pickNode().
+  struct SchedCandidate {
+    // The best SUnit candidate.
+    SUnit *SU;
+
+    // Register pressure values for the best candidate.
+    RegPressureDelta RPDelta;
+
+    // Best scheduling cost.
+    int SCost;
+
+    SchedCandidate(): SU(NULL), SCost(0) {}
+  };
+  /// Represent the type of SchedCandidate found within a single queue.
+  enum CandResult {
+    NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure,
+    BestCost};
+
+  /// Each Scheduling boundary is associated with ready queues. It tracks the
+  /// current cycle in whichever direction at has moved, and maintains the state
+  /// of "hazards" and other interlocks at the current cycle.
+  struct SchedBoundary {
+    VLIWMachineScheduler *DAG;
+
+    ReadyQueue Available;
+    ReadyQueue Pending;
+    bool CheckPending;
+
+    ScheduleHazardRecognizer *HazardRec;
+    VLIWResourceModel *ResourceModel;
+
+    unsigned CurrCycle;
+    unsigned IssueCount;
+
+    /// MinReadyCycle - Cycle of the soonest available instruction.
+    unsigned MinReadyCycle;
+
+    // Remember the greatest min operand latency.
+    unsigned MaxMinLatency;
+
+    /// Pending queues extend the ready queues with the same ID and the
+    /// PendingFlag set.
+    SchedBoundary(unsigned ID, const Twine &Name):
+      DAG(0), Available(ID, Name+".A"),
+      Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P"),
+      CheckPending(false), HazardRec(0), ResourceModel(0),
+      CurrCycle(0), IssueCount(0),
+      MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+
+    ~SchedBoundary() {
+      delete ResourceModel;
+      delete HazardRec;
+    }
+
+    bool isTop() const {
+      return Available.getID() == ConvergingVLIWScheduler::TopQID;
+    }
+
+    bool checkHazard(SUnit *SU);
+
+    void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+    void bumpCycle();
+
+    void bumpNode(SUnit *SU);
+
+    void releasePending();
+
+    void removeReady(SUnit *SU);
+
+    SUnit *pickOnlyChoice();
+  };
+
+  VLIWMachineScheduler *DAG;
+  const TargetRegisterInfo *TRI;
+
+  // State of the top and bottom scheduled instruction boundaries.
+  SchedBoundary Top;
+  SchedBoundary Bot;
+
+public:
+  /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
+  enum {
+    TopQID = 1,
+    BotQID = 2,
+    LogMaxQID = 2
+  };
+
+  ConvergingVLIWScheduler():
+    DAG(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+
+  virtual void initialize(VLIWMachineScheduler *dag);
+
+  virtual SUnit *pickNode(bool &IsTopNode);
+
+  virtual void schedNode(SUnit *SU, bool IsTopNode);
+
+  virtual void releaseTopNode(SUnit *SU);
+
+  virtual void releaseBottomNode(SUnit *SU);
+
+protected:
+  SUnit *pickNodeBidrectional(bool &IsTopNode);
+
+  int SchedulingCost(ReadyQueue &Q,
+                     SUnit *SU, SchedCandidate &Candidate,
+                     RegPressureDelta &Delta, bool verbose);
+
+  CandResult pickNodeFromQueue(ReadyQueue &Q,
+                               const RegPressureTracker &RPTracker,
+                               SchedCandidate &Candidate);
+#ifndef NDEBUG
+  void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
+                      PressureElement P = PressureElement());
+#endif
+};
+
+} // namespace
+
+
+#endif
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 7ece408..1e91c39 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -337,7 +337,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
 
   DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
                << "********** Function: "
-               << MF.getFunction()->getName() << "\n");
+               << MF.getName() << "\n");
 
 #if 0
   // for now disable this, if we move NewValueJump before register
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 55cbc09..a295015 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -109,6 +109,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
 
   DenseMap<unsigned, unsigned> PeepholeMap;
+  DenseMap<unsigned, std::pair<unsigned, unsigned> > PeepholeDoubleRegsMap;
 
   if (DisableHexagonPeephole) return false;
 
@@ -117,6 +118,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
        MBBb != MBBe; ++MBBb) {
     MachineBasicBlock* MBB = MBBb;
     PeepholeMap.clear();
+    PeepholeDoubleRegsMap.clear();
 
     // Traverse the basic block.
     for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
@@ -140,6 +142,24 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
         }
       }
 
+      // Look for this sequence below
+      // %vregDoubleReg1 = LSRd_ri %vregDoubleReg0, 32
+      // %vregIntReg = COPY %vregDoubleReg1:subreg_loreg.
+      // and convert into
+      // %vregIntReg = COPY %vregDoubleReg0:subreg_hireg.
+      if (MI->getOpcode() == Hexagon::LSRd_ri) {
+        assert(MI->getNumOperands() == 3);
+        MachineOperand &Dst = MI->getOperand(0);
+        MachineOperand &Src1 = MI->getOperand(1);
+        MachineOperand &Src2 = MI->getOperand(2);
+        if (Src2.getImm() != 32)
+          continue;
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src1.getReg();
+        PeepholeDoubleRegsMap[DstReg] =
+          std::make_pair(*&SrcReg, 1/*Hexagon::subreg_hireg*/);
+      }
+
       // Look for P=NOT(P).
       if (!DisablePNotP &&
           (MI->getOpcode() == Hexagon::NOT_p)) {
@@ -178,6 +198,21 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
             // Change the 1st operand.
             MI->RemoveOperand(1);
             MI->addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
+          } else  {
+            DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI =
+              PeepholeDoubleRegsMap.find(SrcReg);
+            if (DI != PeepholeDoubleRegsMap.end()) {
+              std::pair<unsigned,unsigned> PeepholeSrc = DI->second;
+              MI->RemoveOperand(1);
+              MI->addOperand(MachineOperand::CreateReg(PeepholeSrc.first,
+                                                       false /*isDef*/,
+                                                       false /*isImp*/,
+                                                       false /*isKill*/,
+                                                       false /*isDead*/,
+                                                       false /*isUndef*/,
+                                                       false /*isEarlyClobber*/,
+                                                       PeepholeSrc.second));
+            }
           }
         }
       }
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 2c23674..3742486 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -310,6 +310,58 @@ void HexagonRegisterInfo::getInitialFrameState(std::vector<MachineMove>
   Moves.push_back(MachineMove(0, Dst, Src));
 }
 
+// Get the weight in units of pressure for this register class.
+const RegClassWeight &
+HexagonRegisterInfo::getRegClassWeight(const TargetRegisterClass *RC) const {
+  // Each TargetRegisterClass has a per register weight, and weight
+  // limit which must be less than the limits of its pressure sets.
+  static const RegClassWeight RCWeightTable[] = {
+    {1, 32}, // IntRegs
+    {1, 8},  // CRRegs
+    {1, 4},  // PredRegs
+    {2, 16}, // DoubleRegs
+    {0, 0} };
+  return RCWeightTable[RC->getID()];
+}
+
+/// Get the number of dimensions of register pressure.
+unsigned HexagonRegisterInfo::getNumRegPressureSets() const {
+  return 4;
+}
+
+/// Get the name of this register unit pressure set.
+const char *HexagonRegisterInfo::getRegPressureSetName(unsigned Idx) const {
+  static const char *const RegPressureSetName[] = {
+    "IntRegsRegSet",
+    "CRRegsRegSet",
+    "PredRegsRegSet",
+    "DoubleRegsRegSet"
+  };
+  assert((Idx < 4) && "Index out of bounds");
+  return RegPressureSetName[Idx];
+}
+
+/// Get the register unit pressure limit for this dimension.
+/// This limit must be adjusted dynamically for reserved registers.
+unsigned HexagonRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
+  static const int RegPressureLimit [] = { 16, 4, 2, 8 };
+  assert((Idx < 4) && "Index out of bounds");
+  return RegPressureLimit[Idx];
+}
+
+const int*
+HexagonRegisterInfo::getRegClassPressureSets(const TargetRegisterClass *RC)
+  const {
+  static const int RCSetsTable[] = {
+    0,  -1,  // IntRegs
+    1,  -1,  // CRRegs
+    2,  -1,  // PredRegs
+    0,  -1,  // DoubleRegs
+    -1 };
+  static const unsigned RCSetStartTable[] = { 0, 2, 4, 6, 0 };
+  unsigned SetListStart = RCSetStartTable[RC->getID()];
+  return &RCSetsTable[SetListStart];
+}
 unsigned HexagonRegisterInfo::getEHExceptionRegister() const {
   llvm_unreachable("What is the exception register");
 }
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 85355ae..8820d13 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -87,6 +87,11 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
   // Exception handling queries.
   unsigned getEHExceptionRegister() const;
   unsigned getEHHandlerRegister() const;
+  const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
+  unsigned getNumRegPressureSets() const;
+  const char *getRegPressureSetName(unsigned Idx) const;
+  unsigned getRegPressureSetLimit(unsigned Idx) const;
+  const int* getRegClassPressureSets(const TargetRegisterClass *RC) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index d1076b8..b5ff69a 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -47,6 +47,7 @@ def HexagonModel : SchedMachineModel {
   // Max issue per cycle == bundle width.
   let IssueWidth = 4;
   let Itineraries = HexagonItineraries;
+  let LoadLatency = 1;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td
index 9b41126..5668ae8 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -58,6 +58,7 @@ def HexagonModelV4 : SchedMachineModel {
   // Max issue per cycle == bundle width.
   let IssueWidth = 4;
   let Itineraries = HexagonItinerariesV4;
+  let LoadLatency = 1;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index a7b291f..5688e9c 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -14,6 +14,7 @@
 #include "HexagonTargetMachine.h"
 #include "Hexagon.h"
 #include "HexagonISelLowering.h"
+#include "HexagonMachineScheduler.h"
 #include "llvm/Module.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/PassManager.h"
@@ -29,6 +30,11 @@ opt<bool> DisableHardwareLoops(
                         "disable-hexagon-hwloops", cl::Hidden,
                         cl::desc("Disable Hardware Loops for Hexagon target"));
 
+static cl::
+opt<bool> DisableHexagonMISched("disable-hexagon-misched",
+                                cl::Hidden, cl::ZeroOrMore, cl::init(false),
+                                cl::desc("Disable Hexagon MI Scheduling"));
+
 /// HexagonTargetMachineModule - Note that this is used on hosts that
 /// cannot link in a library unless there are references into the
 /// library.  In particular, it seems that it is not possible to get
@@ -42,6 +48,13 @@ extern "C" void LLVMInitializeHexagonTarget() {
   RegisterTargetMachine<HexagonTargetMachine> X(TheHexagonTarget);
 }
 
+static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
+  return new VLIWMachineScheduler(C, new ConvergingVLIWScheduler());
+}
+
+static MachineSchedRegistry
+SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
+                    createVLIWMachineSched);
 
 /// HexagonTargetMachine ctor - Create an ILP32 architecture model.
 ///
@@ -83,7 +96,13 @@ namespace {
 class HexagonPassConfig : public TargetPassConfig {
 public:
   HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {}
+    : TargetPassConfig(TM, PM) {
+    // Enable MI scheduler.
+    if (!DisableHexagonMISched) {
+      enablePass(&MachineSchedulerID);
+      MachineSchedRegistry::setDefault(createVLIWMachineSched);
+    }
+  }
 
   HexagonTargetMachine &getHexagonTargetMachine() const {
     return getTM<HexagonTargetMachine>();
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index a03ed03..3d5f685 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -3474,8 +3474,8 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
       // 1. Two loads unless they are volatile.
       // 2. Two stores in V4 unless they are volatile.
       else if ((DepType == SDep::Order) &&
-               !I->hasVolatileMemoryRef() &&
-               !J->hasVolatileMemoryRef()) {
+               !I->hasOrderedMemoryRef() &&
+               !J->hasOrderedMemoryRef()) {
         if (QRI->Subtarget.hasV4TOps() &&
             // hexagonv4 allows dual store.
             MCIDI.mayStore() && MCIDJ.mayStore()) {
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index d6e6c36..86f75d1 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -24,7 +24,7 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Target &T, StringRef TT) {
   HasLEB128 = true;
 
   PrivateGlobalPrefix = ".L";
-  LCOMMDirectiveType = LCOMM::ByteAlignment;
+  LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
   InlineAsmStart = "# InlineAsm Start";
   InlineAsmEnd = "# InlineAsm End";
   ZeroDirective = "\t.space\t";
diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
index 38fb0e8..4059403 100644
--- a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
+++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
@@ -56,6 +56,12 @@ class MBlazeAsmParser : public MCTargetAsmParser {
 
   /// }
 
+  unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst,
+                    const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               unsigned OperandNum, unsigned &NumMCOperands) {
+    return getMCInstOperandNumImpl(Kind, Inst, Operands, OperandNum,
+                                   NumMCOperands);
+  }
 
 public:
   MBlazeAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
@@ -317,10 +323,10 @@ MatchAndEmitInstruction(SMLoc IDLoc,
                         SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                         MCStreamer &Out) {
   MCInst Inst;
-  SMLoc ErrorLoc;
+  unsigned Kind;
   unsigned ErrorInfo;
 
-  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo)) {
+  switch (MatchInstructionImpl(Operands, Kind, Inst, ErrorInfo)) {
   default: break;
   case Match_Success:
     Out.EmitInstruction(Inst);
@@ -329,10 +335,8 @@ MatchAndEmitInstruction(SMLoc IDLoc,
     return Error(IDLoc, "instruction use requires an option to be enabled");
   case Match_MnemonicFail:
       return Error(IDLoc, "unrecognized instruction mnemonic");
-  case Match_ConversionFail:
-    return Error(IDLoc, "unable to convert operands to instruction");
-  case Match_InvalidOperand:
-    ErrorLoc = IDLoc;
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
     if (ErrorInfo != ~0U) {
       if (ErrorInfo >= Operands.size())
         return Error(IDLoc, "too few operands for instruction");
@@ -343,6 +347,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
 
     return Error(ErrorLoc, "invalid operand for instruction");
   }
+  }
 
   llvm_unreachable("Implement any new match types added!");
 }
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
index 46f5207..daa76e8 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
@@ -140,7 +140,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
 
   unsigned oi = i == 2 ? 1 : 2;
 
-  DEBUG(dbgs() << "\nFunction : " << MF.getFunction()->getName() << "\n";
+  DEBUG(dbgs() << "\nFunction : " << MF.getName() << "\n";
         dbgs() << "<--------->\n" << MI);
 
   int FrameIndex = MI.getOperand(i).getIndex();
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 43bd345..8418b75 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -8,20 +8,36 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/MC/MCTargetAsmParser.h"
-#include "llvm/Support/TargetRegistry.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCTargetAsmParser.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
 
 namespace {
+
 class MipsAsmParser : public MCTargetAsmParser {
 
+  enum FpFormatTy {
+    FP_FORMAT_NONE = -1,
+    FP_FORMAT_S,
+    FP_FORMAT_D,
+    FP_FORMAT_L,
+    FP_FORMAT_W
+  } FpFormat;
+
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+
 #define GET_ASSEMBLER_HEADER
 #include "MipsGenAsmMatcher.inc"
 
@@ -34,14 +50,67 @@ class MipsAsmParser : public MCTargetAsmParser {
   bool ParseInstruction(StringRef Name, SMLoc NameLoc,
                         SmallVectorImpl<MCParsedAsmOperand*> &Operands);
 
+  bool parseMathOperation(StringRef Name, SMLoc NameLoc,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
   bool ParseDirective(AsmToken DirectiveID);
 
-  OperandMatchResultTy parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*>&);
+  MipsAsmParser::OperandMatchResultTy
+  parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*>&);
+
+  unsigned
+  getMCInstOperandNum(unsigned Kind, MCInst &Inst,
+                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                      unsigned OperandNum, unsigned &NumMCOperands);
+
+  bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &,
+                    StringRef Mnemonic);
+
+  int tryParseRegister(StringRef Mnemonic);
+
+  bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               StringRef Mnemonic);
+
+  bool parseMemOffset(const MCExpr *&Res);
+  bool parseRelocOperand(const MCExpr *&Res);
+  MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);
+
+  bool isMips64() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips64) != 0;
+  }
+
+  bool isFP64() const {
+    return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0;
+  }
+
+  int matchRegisterName(StringRef Symbol);
+
+  int matchRegisterByNumber(unsigned RegNum, StringRef Mnemonic);
+
+  void setFpFormat(FpFormatTy Format) {
+    FpFormat = Format;
+  }
+
+  void setDefaultFpFormat();
+
+  void setFpFormat(StringRef Format);
+
+  FpFormatTy getFpFormat() {return FpFormat;}
+
+  bool requestsDoubleOperand(StringRef Mnemonic);
+
+  unsigned getReg(int RC,int RegNo);
+
 public:
   MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
-    : MCTargetAsmParser() {
+    : MCTargetAsmParser(), STI(sti), Parser(parser) {
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
 
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
 };
 }
 
@@ -50,6 +119,7 @@ namespace {
 /// MipsOperand - Instances of this class represent a parsed Mips machine
 /// instruction.
 class MipsOperand : public MCParsedAsmOperand {
+
   enum KindTy {
     k_CondCode,
     k_CoprocNum,
@@ -61,18 +131,58 @@ class MipsOperand : public MCParsedAsmOperand {
   } Kind;
 
   MipsOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+  union {
+    struct {
+      const char *Data;
+      unsigned Length;
+    } Tok;
+
+    struct {
+      unsigned RegNum;
+    } Reg;
+
+    struct {
+      const MCExpr *Val;
+    } Imm;
+
+    struct {
+      unsigned Base;
+      const MCExpr *Off;
+    } Mem;
+  };
+
+  SMLoc StartLoc, EndLoc;
+
 public:
   void addRegOperands(MCInst &Inst, unsigned N) const {
-    llvm_unreachable("unimplemented!");
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
   }
+
   void addExpr(MCInst &Inst, const MCExpr *Expr) const{
-    llvm_unreachable("unimplemented!");
+    // Add as immediate when possible.  Null MCExpr = 0.
+    if (Expr == 0)
+      Inst.addOperand(MCOperand::CreateImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
   }
+
   void addImmOperands(MCInst &Inst, unsigned N) const {
-    llvm_unreachable("unimplemented!");
+    assert(N == 1 && "Invalid number of operands!");
+    const MCExpr *Expr = getImm();
+    addExpr(Inst,Expr);
   }
+
   void addMemOperands(MCInst &Inst, unsigned N) const {
-    llvm_unreachable("unimplemented!");
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::CreateReg(getMemBase()));
+
+    const MCExpr *Expr = getMemOff();
+    addExpr(Inst,Expr);
   }
 
   bool isReg() const { return Kind == k_Register; }
@@ -82,46 +192,751 @@ public:
 
   StringRef getToken() const {
     assert(Kind == k_Token && "Invalid access!");
-    return "";
+    return StringRef(Tok.Data, Tok.Length);
   }
 
   unsigned getReg() const {
     assert((Kind == k_Register) && "Invalid access!");
-    return 0;
+    return Reg.RegNum;
+  }
+
+  const MCExpr *getImm() const {
+    assert((Kind == k_Immediate) && "Invalid access!");
+    return Imm.Val;
   }
 
+  unsigned getMemBase() const {
+    assert((Kind == k_Memory) && "Invalid access!");
+    return Mem.Base;
+  }
+
+  const MCExpr *getMemOff() const {
+    assert((Kind == k_Memory) && "Invalid access!");
+    return Mem.Off;
+  }
+
+  static MipsOperand *CreateToken(StringRef Str, SMLoc S) {
+    MipsOperand *Op = new MipsOperand(k_Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static MipsOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
+    MipsOperand *Op = new MipsOperand(k_Register);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+    MipsOperand *Op = new MipsOperand(k_Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static MipsOperand *CreateMem(unsigned Base, const MCExpr *Off,
+                                 SMLoc S, SMLoc E) {
+    MipsOperand *Op = new MipsOperand(k_Memory);
+    Op->Mem.Base = Base;
+    Op->Mem.Off = Off;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const { return EndLoc; }
+
   virtual void print(raw_ostream &OS) const {
     llvm_unreachable("unimplemented!");
   }
 };
 }
 
+unsigned MipsAsmParser::
+getMCInstOperandNum(unsigned Kind, MCInst &Inst,
+                    const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                    unsigned OperandNum, unsigned &NumMCOperands) {
+  assert (0 && "getMCInstOperandNum() not supported by the Mips target.");
+  // The Mips backend doesn't currently include the matcher implementation, so
+  // the getMCInstOperandNumImpl() is undefined.  This is a temporary
+  // work around.
+  NumMCOperands = 0;
+  return 0;
+}
+
 bool MipsAsmParser::
 MatchAndEmitInstruction(SMLoc IDLoc,
                         SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                         MCStreamer &Out) {
+  MCInst Inst;
+  unsigned ErrorInfo;
+  unsigned Kind;
+  unsigned MatchResult = MatchInstructionImpl(Operands, Kind, Inst, ErrorInfo);
+
+  switch (MatchResult) {
+  default: break;
+  case Match_Success: {
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst);
+    return false;
+  }
+  case Match_MissingFeature:
+    Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+    return true;
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((MipsOperand*)Operands[ErrorInfo])->getStartLoc();
+      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction");
+  }
   return true;
 }
 
+int MipsAsmParser::matchRegisterName(StringRef Name) {
+
+   int CC = StringSwitch<unsigned>(Name)
+    .Case("zero",  Mips::ZERO)
+    .Case("a0",  Mips::A0)
+    .Case("a1",  Mips::A1)
+    .Case("a2",  Mips::A2)
+    .Case("a3",  Mips::A3)
+    .Case("v0",  Mips::V0)
+    .Case("v1",  Mips::V1)
+    .Case("s0",  Mips::S0)
+    .Case("s1",  Mips::S1)
+    .Case("s2",  Mips::S2)
+    .Case("s3",  Mips::S3)
+    .Case("s4",  Mips::S4)
+    .Case("s5",  Mips::S5)
+    .Case("s6",  Mips::S6)
+    .Case("s7",  Mips::S7)
+    .Case("k0",  Mips::K0)
+    .Case("k1",  Mips::K1)
+    .Case("sp",  Mips::SP)
+    .Case("fp",  Mips::FP)
+    .Case("gp",  Mips::GP)
+    .Case("ra",  Mips::RA)
+    .Case("t0",  Mips::T0)
+    .Case("t1",  Mips::T1)
+    .Case("t2",  Mips::T2)
+    .Case("t3",  Mips::T3)
+    .Case("t4",  Mips::T4)
+    .Case("t5",  Mips::T5)
+    .Case("t6",  Mips::T6)
+    .Case("t7",  Mips::T7)
+    .Case("t8",  Mips::T8)
+    .Case("t9",  Mips::T9)
+    .Case("at",  Mips::AT)
+    .Case("fcc0",  Mips::FCC0)
+    .Default(-1);
+
+  if (CC != -1) {
+    //64 bit register in Mips are following 32 bit definitions.
+    if (isMips64())
+      CC++;
+    return CC;
+  }
+
+  if (Name[0] == 'f') {
+    StringRef NumString = Name.substr(1);
+    unsigned IntVal;
+    if( NumString.getAsInteger(10, IntVal))
+      return -1; //not integer
+    if (IntVal > 31)
+      return -1;
+
+    FpFormatTy Format = getFpFormat();
+
+    if (Format == FP_FORMAT_S || Format == FP_FORMAT_W)
+      return getReg(Mips::FGR32RegClassID, IntVal);
+    if (Format == FP_FORMAT_D) {
+      if(isFP64()) {
+        return getReg(Mips::FGR64RegClassID, IntVal);
+      }
+      //only even numbers available as register pairs
+      if (( IntVal > 31) || (IntVal%2 !=  0))
+        return -1;
+      return getReg(Mips::AFGR64RegClassID, IntVal/2);
+    }
+  }
+
+  return -1;
+}
+void MipsAsmParser::setDefaultFpFormat() {
+
+  if (isMips64() || isFP64())
+    FpFormat = FP_FORMAT_D;
+  else
+    FpFormat = FP_FORMAT_S;
+}
+
+bool MipsAsmParser::requestsDoubleOperand(StringRef Mnemonic){
+
+  bool IsDouble = StringSwitch<bool>(Mnemonic.lower())
+    .Case("ldxc1", true)
+    .Case("ldc1",  true)
+    .Case("sdxc1", true)
+    .Case("sdc1",  true)
+    .Default(false);
+
+  return IsDouble;
+}
+void MipsAsmParser::setFpFormat(StringRef Format) {
+
+  FpFormat = StringSwitch<FpFormatTy>(Format.lower())
+    .Case(".s",  FP_FORMAT_S)
+    .Case(".d",  FP_FORMAT_D)
+    .Case(".l",  FP_FORMAT_L)
+    .Case(".w",  FP_FORMAT_W)
+    .Default(FP_FORMAT_NONE);
+}
+
+unsigned MipsAsmParser::getReg(int RC,int RegNo){
+  return *(getContext().getRegisterInfo().getRegClass(RC).begin() + RegNo);
+}
+
+int MipsAsmParser::matchRegisterByNumber(unsigned RegNum,StringRef Mnemonic) {
+
+  if (Mnemonic.lower() == "rdhwr") {
+    //at the moment only hwreg29 is supported
+    if (RegNum != 29)
+      return -1;
+    return Mips::HWR29;
+  }
+
+  if (RegNum > 31)
+    return -1;
+
+  return getReg(Mips::CPURegsRegClassID,RegNum);
+}
+
+int MipsAsmParser::tryParseRegister(StringRef Mnemonic) {
+  const AsmToken &Tok = Parser.getTok();
+  int RegNum = -1;
+
+  if (Tok.is(AsmToken::Identifier)) {
+    std::string lowerCase = Tok.getString().lower();
+    RegNum = matchRegisterName(lowerCase);
+  } else if (Tok.is(AsmToken::Integer))
+    RegNum = matchRegisterByNumber(static_cast<unsigned> (Tok.getIntVal()),
+                                   Mnemonic.lower());
+    else
+      return RegNum;  //error
+  //64 bit div operations require Mips::ZERO instead of MIPS::ZERO_64
+  if (isMips64() && RegNum == Mips::ZERO_64) {
+    if (Mnemonic.find("ddiv") != StringRef::npos)
+      RegNum = Mips::ZERO;
+  }
+  return RegNum;
+}
+
 bool MipsAsmParser::
-ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                          StringRef Mnemonic){
+
+  SMLoc S = Parser.getTok().getLoc();
+  int RegNo = -1;
+
+  //FIXME: we should make a more generic method for CCR
+  if ((Mnemonic == "cfc1" || Mnemonic == "ctc1")
+      && Operands.size() == 2 && Parser.getTok().is(AsmToken::Integer)){
+    RegNo = Parser.getTok().getIntVal();  //get the int value
+    //at the moment only fcc0 is supported
+    if (RegNo ==  0)
+      RegNo = Mips::FCC0;
+  } else
+    RegNo = tryParseRegister(Mnemonic);
+  if (RegNo == -1)
+    return true;
+
+  Operands.push_back(MipsOperand::CreateReg(RegNo, S,
+      Parser.getTok().getLoc()));
+  Parser.Lex(); // Eat register token.
+  return false;
+}
+
+bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
+                                 StringRef Mnemonic) {
+  //Check if the current operand has a custom associated parser, if so, try to
+  //custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  switch (getLexer().getKind()) {
+  default:
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+    return true;
+  case AsmToken::Dollar: {
+    //parse register
+    SMLoc S = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat dollar token.
+    //parse register operand
+    if (!tryParseRegisterOperand(Operands,Mnemonic)) {
+      if (getLexer().is(AsmToken::LParen)) {
+        //check if it is indexed addressing operand
+        Operands.push_back(MipsOperand::CreateToken("(", S));
+        Parser.Lex(); //eat parenthesis
+        if (getLexer().isNot(AsmToken::Dollar))
+          return true;
+
+        Parser.Lex(); //eat dollar
+        if (tryParseRegisterOperand(Operands,Mnemonic))
+          return true;
+
+        if (!getLexer().is(AsmToken::RParen))
+          return true;
+
+        S = Parser.getTok().getLoc();
+        Operands.push_back(MipsOperand::CreateToken(")", S));
+        Parser.Lex();
+      }
+      return false;
+    }
+    //maybe it is a symbol reference
+    StringRef Identifier;
+    if (Parser.ParseIdentifier(Identifier))
+      return true;
+
+    SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+    MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
+
+    // Otherwise create a symbol ref.
+    const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+                                                getContext());
+
+    Operands.push_back(MipsOperand::CreateImm(Res, S, E));
+    return false;
+  }
+  case AsmToken::Identifier:
+  case AsmToken::LParen:
+  case AsmToken::Minus:
+  case AsmToken::Plus:
+  case AsmToken::Integer:
+  case AsmToken::String: {
+     // quoted label names
+    const MCExpr *IdVal;
+    SMLoc S = Parser.getTok().getLoc();
+    if (getParser().ParseExpression(IdVal))
+      return true;
+    SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+    return false;
+  }
+  case AsmToken::Percent: {
+    //it is a symbol reference or constant expression
+    const MCExpr *IdVal;
+    SMLoc S = Parser.getTok().getLoc(); //start location of the operand
+    if (parseRelocOperand(IdVal))
+      return true;
+
+    SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+    Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+    return false;
+  }//case AsmToken::Percent
+  }//switch(getLexer().getKind())
+  return true;
+}
+
+bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) {
+
+  Parser.Lex(); //eat % token
+  const AsmToken &Tok = Parser.getTok(); //get next token, operation
+  if (Tok.isNot(AsmToken::Identifier))
+    return true;
+
+  std::string Str = Tok.getIdentifier().str();
+
+  Parser.Lex(); //eat identifier
+  //now make expression from the rest of the operand
+  const MCExpr *IdVal;
+  SMLoc EndLoc;
+
+  if (getLexer().getKind() == AsmToken::LParen) {
+    while (1) {
+      Parser.Lex(); //eat '(' token
+      if (getLexer().getKind() == AsmToken::Percent) {
+        Parser.Lex(); //eat % token
+        const AsmToken &nextTok = Parser.getTok();
+        if (nextTok.isNot(AsmToken::Identifier))
+          return true;
+        Str += "(%";
+        Str += nextTok.getIdentifier();
+        Parser.Lex(); //eat identifier
+        if (getLexer().getKind() != AsmToken::LParen)
+          return true;
+      } else
+        break;
+    }
+    if (getParser().ParseParenExpression(IdVal,EndLoc))
+      return true;
+
+    while (getLexer().getKind() == AsmToken::RParen)
+      Parser.Lex(); //eat ')' token
+
+  } else
+    return true; //parenthesis must follow reloc operand
+
+  //Check the type of the expression
+  if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal)) {
+    //it's a constant, evaluate lo or hi value
+    int Val = MCE->getValue();
+    if (Str == "lo") {
+      Val = Val & 0xffff;
+    } else if (Str == "hi") {
+      Val = (Val & 0xffff0000) >> 16;
+    }
+    Res = MCConstantExpr::Create(Val, getContext());
+    return false;
+  }
+
+  if (const MCSymbolRefExpr *MSRE = dyn_cast<MCSymbolRefExpr>(IdVal)) {
+    //it's a symbol, create symbolic expression from symbol
+    StringRef Symbol = MSRE->getSymbol().getName();
+    MCSymbolRefExpr::VariantKind VK = getVariantKind(Str);
+    Res = MCSymbolRefExpr::Create(Symbol,VK,getContext());
+    return false;
+  }
   return true;
 }
 
+bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                  SMLoc &EndLoc) {
+
+  StartLoc = Parser.getTok().getLoc();
+  RegNo = tryParseRegister("");
+  EndLoc = Parser.getTok().getLoc();
+  return (RegNo == (unsigned)-1);
+}
+
+bool MipsAsmParser::parseMemOffset(const MCExpr *&Res) {
+
+  SMLoc S;
+
+  switch(getLexer().getKind()) {
+  default:
+    return true;
+  case AsmToken::Integer:
+  case AsmToken::Minus:
+  case AsmToken::Plus:
+    return (getParser().ParseExpression(Res));
+  case AsmToken::Percent:
+    return parseRelocOperand(Res);
+  case AsmToken::LParen:
+    return false;  //it's probably assuming 0
+  }
+  return true;
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
+               SmallVectorImpl<MCParsedAsmOperand*>&Operands) {
+
+  const MCExpr *IdVal = 0;
+  SMLoc S;
+  //first operand is the offset
+  S = Parser.getTok().getLoc();
+
+  if (parseMemOffset(IdVal))
+    return MatchOperand_ParseFail;
+
+  const AsmToken &Tok = Parser.getTok(); //get next token
+  if (Tok.isNot(AsmToken::LParen)) {
+    Error(Parser.getTok().getLoc(), "'(' expected");
+    return MatchOperand_ParseFail;
+  }
+
+  Parser.Lex(); // Eat '(' token.
+
+  const AsmToken &Tok1 = Parser.getTok(); //get next token
+  if (Tok1.is(AsmToken::Dollar)) {
+    Parser.Lex(); // Eat '$' token.
+    if (tryParseRegisterOperand(Operands,"")) {
+      Error(Parser.getTok().getLoc(), "unexpected token in operand");
+      return MatchOperand_ParseFail;
+    }
+
+  } else {
+    Error(Parser.getTok().getLoc(),"unexpected token in operand");
+    return MatchOperand_ParseFail;
+  }
+
+  const AsmToken &Tok2 = Parser.getTok(); //get next token
+  if (Tok2.isNot(AsmToken::RParen)) {
+    Error(Parser.getTok().getLoc(), "')' expected");
+    return MatchOperand_ParseFail;
+  }
+
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+  Parser.Lex(); // Eat ')' token.
+
+  if (IdVal == 0)
+    IdVal = MCConstantExpr::Create(0, getContext());
+
+  //now replace register operand with the mem operand
+  MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
+  int RegNo = op->getReg();
+  //remove register from operands
+  Operands.pop_back();
+  //and add memory operand
+  Operands.push_back(MipsOperand::CreateMem(RegNo, IdVal, S, E));
+  delete op;
+  return MatchOperand_Success;
+}
+
+MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
+
+  MCSymbolRefExpr::VariantKind VK
+                   = StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
+    .Case("hi",          MCSymbolRefExpr::VK_Mips_ABS_HI)
+    .Case("lo",          MCSymbolRefExpr::VK_Mips_ABS_LO)
+    .Case("gp_rel",      MCSymbolRefExpr::VK_Mips_GPREL)
+    .Case("call16",      MCSymbolRefExpr::VK_Mips_GOT_CALL)
+    .Case("got",         MCSymbolRefExpr::VK_Mips_GOT)
+    .Case("tlsgd",       MCSymbolRefExpr::VK_Mips_TLSGD)
+    .Case("tlsldm",      MCSymbolRefExpr::VK_Mips_TLSLDM)
+    .Case("dtprel_hi",   MCSymbolRefExpr::VK_Mips_DTPREL_HI)
+    .Case("dtprel_lo",   MCSymbolRefExpr::VK_Mips_DTPREL_LO)
+    .Case("gottprel",    MCSymbolRefExpr::VK_Mips_GOTTPREL)
+    .Case("tprel_hi",    MCSymbolRefExpr::VK_Mips_TPREL_HI)
+    .Case("tprel_lo",    MCSymbolRefExpr::VK_Mips_TPREL_LO)
+    .Case("got_disp",    MCSymbolRefExpr::VK_Mips_GOT_DISP)
+    .Case("got_page",    MCSymbolRefExpr::VK_Mips_GOT_PAGE)
+    .Case("got_ofst",    MCSymbolRefExpr::VK_Mips_GOT_OFST)
+    .Case("hi(%neg(%gp_rel",    MCSymbolRefExpr::VK_Mips_GPOFF_HI)
+    .Case("lo(%neg(%gp_rel",    MCSymbolRefExpr::VK_Mips_GPOFF_LO)
+    .Default(MCSymbolRefExpr::VK_None);
+
+  return VK;
+}
+
+static int ConvertCcString(StringRef CondString) {
+  int CC = StringSwitch<unsigned>(CondString)
+      .Case(".f",    0)
+      .Case(".un",   1)
+      .Case(".eq",   2)
+      .Case(".ueq",  3)
+      .Case(".olt",  4)
+      .Case(".ult",  5)
+      .Case(".ole",  6)
+      .Case(".ule",  7)
+      .Case(".sf",   8)
+      .Case(".ngle", 9)
+      .Case(".seq",  10)
+      .Case(".ngl",  11)
+      .Case(".lt",   12)
+      .Case(".nge",  13)
+      .Case(".le",   14)
+      .Case(".ngt",  15)
+      .Default(-1);
+
+  return CC;
+}
+
+bool MipsAsmParser::
+parseMathOperation(StringRef Name, SMLoc NameLoc,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  //split the format
+  size_t Start = Name.find('.'), Next = Name.rfind('.');
+  StringRef Format1 = Name.slice(Start, Next);
+  //and add the first format to the operands
+  Operands.push_back(MipsOperand::CreateToken(Format1, NameLoc));
+  //now for the second format
+  StringRef Format2 = Name.slice(Next, StringRef::npos);
+  Operands.push_back(MipsOperand::CreateToken(Format2, NameLoc));
+
+  //set the format for the first register
+  setFpFormat(Format1);
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (ParseOperand(Operands, Name)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.EatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+
+    if (getLexer().isNot(AsmToken::Comma)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.EatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+
+    }
+    Parser.Lex();  // Eat the comma.
+
+    //set the format for the first register
+    setFpFormat(Format2);
+
+    // Parse and remember the operand.
+    if (ParseOperand(Operands, Name)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.EatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    Parser.EatToEndOfStatement();
+    return Error(Loc, "unexpected token in argument list");
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+
 bool MipsAsmParser::
 ParseInstruction(StringRef Name, SMLoc NameLoc,
                  SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  return true;
+  //floating point instructions: should register be treated as double?
+  if (requestsDoubleOperand(Name)) {
+    setFpFormat(FP_FORMAT_D);
+  Operands.push_back(MipsOperand::CreateToken(Name, NameLoc));
+  }
+  else {
+    setDefaultFpFormat();
+    // Create the leading tokens for the mnemonic, split by '.' characters.
+    size_t Start = 0, Next = Name.find('.');
+    StringRef Mnemonic = Name.slice(Start, Next);
+
+    Operands.push_back(MipsOperand::CreateToken(Mnemonic, NameLoc));
+
+    if (Next != StringRef::npos) {
+      //there is a format token in mnemonic
+      //StringRef Rest = Name.slice(Next, StringRef::npos);
+      size_t Dot = Name.find('.', Next+1);
+      StringRef Format = Name.slice(Next, Dot);
+      if (Dot == StringRef::npos) //only one '.' in a string, it's a format
+        Operands.push_back(MipsOperand::CreateToken(Format, NameLoc));
+      else {
+        if (Name.startswith("c.")){
+          // floating point compare, add '.' and immediate represent for cc
+          Operands.push_back(MipsOperand::CreateToken(".", NameLoc));
+          int Cc = ConvertCcString(Format);
+          if (Cc == -1) {
+            return Error(NameLoc, "Invalid conditional code");
+          }
+          SMLoc E = SMLoc::getFromPointer(
+              Parser.getTok().getLoc().getPointer() -1 );
+          Operands.push_back(MipsOperand::CreateImm(
+              MCConstantExpr::Create(Cc, getContext()), NameLoc, E));
+        } else {
+          //trunc, ceil, floor ...
+          return parseMathOperation(Name, NameLoc, Operands);
+        }
+
+        //the rest is a format
+        Format = Name.slice(Dot, StringRef::npos);
+        Operands.push_back(MipsOperand::CreateToken(Format, NameLoc));
+      }
+
+      setFpFormat(Format);
+    }
+  }
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (ParseOperand(Operands, Name)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.EatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+
+    while (getLexer().is(AsmToken::Comma) ) {
+      Parser.Lex();  // Eat the comma.
+
+      // Parse and remember the operand.
+      if (ParseOperand(Operands, Name)) {
+        SMLoc Loc = getLexer().getLoc();
+        Parser.EatToEndOfStatement();
+        return Error(Loc, "unexpected token in argument list");
+      }
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    Parser.EatToEndOfStatement();
+    return Error(Loc, "unexpected token in argument list");
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
 }
 
 bool MipsAsmParser::
 ParseDirective(AsmToken DirectiveID) {
-  return true;
-}
 
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::
-  parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*>&) {
-  return MatchOperand_ParseFail;
+  if (DirectiveID.getString() == ".ent") {
+    //ignore this directive for now
+    Parser.Lex();
+    return false;
+  }
+
+  if (DirectiveID.getString() == ".end") {
+    //ignore this directive for now
+    Parser.Lex();
+    return false;
+  }
+
+  if (DirectiveID.getString() == ".frame") {
+    //ignore this directive for now
+    Parser.EatToEndOfStatement();
+    return false;
+  }
+
+  if (DirectiveID.getString() == ".set") {
+    //ignore this directive for now
+    Parser.EatToEndOfStatement();
+    return false;
+  }
+
+  if (DirectiveID.getString() == ".fmask") {
+    //ignore this directive for now
+    Parser.EatToEndOfStatement();
+    return false;
+  }
+
+  if (DirectiveID.getString() == ".mask") {
+    //ignore this directive for now
+    Parser.EatToEndOfStatement();
+    return false;
+  }
+
+  if (DirectiveID.getString() == ".gpword") {
+    //ignore this directive for now
+    Parser.EatToEndOfStatement();
+    return false;
+  }
+
+  return true;
 }
 
 extern "C" void LLVMInitializeMipsAsmParser() {
@@ -130,3 +945,7 @@ extern "C" void LLVMInitializeMipsAsmParser() {
   RegisterMCAsmParser<MipsAsmParser> A(TheMips64Target);
   RegisterMCAsmParser<MipsAsmParser> B(TheMips64elTarget);
 }
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "MipsGenAsmMatcher.inc"
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index f535c50..0f84358 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_target(MipsCodeGen
   MipsAsmPrinter.cpp
   MipsCodeEmitter.cpp
   MipsDelaySlotFiller.cpp
+  MipsDirectObjLower.cpp
   MipsELFWriterInfo.cpp
   MipsJITInfo.cpp
   MipsInstrInfo.cpp
diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
index 234455e..9603327 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -122,7 +122,7 @@ inline static unsigned getMipsRegisterNumbering(unsigned RegEnum)
 {
   switch (RegEnum) {
   case Mips::ZERO: case Mips::ZERO_64: case Mips::F0: case Mips::D0_64:
-  case Mips::D0:
+  case Mips::D0:   case Mips::FCC0:
     return 0;
   case Mips::AT: case Mips::AT_64: case Mips::F1: case Mips::D1_64:
     return 1;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index b8489ca..5d240fe 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -56,7 +56,7 @@ namespace {
 MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
                                          bool _isN64, bool IsLittleEndian)
   : MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS,
-                            /*HasRelocationAddend*/ false,
+                            /*HasRelocationAddend*/ (_isN64) ? true : false,
                             /*IsN64*/ _isN64) {}
 
 MipsELFObjectWriter::~MipsELFObjectWriter() {}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 8dab62d..1d7370a 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -143,7 +143,11 @@ getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
                        SmallVectorImpl<MCFixup> &Fixups) const {
 
   const MCOperand &MO = MI.getOperand(OpNo);
-  assert(MO.isExpr() && "getBranchTargetOpValue expects only expressions");
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm()) return MO.getImm();
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValue expects only expressions or immediates");
 
   const MCExpr *Expr = MO.getExpr();
   Fixups.push_back(MCFixup::Create(0, Expr,
@@ -159,7 +163,10 @@ getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
                      SmallVectorImpl<MCFixup> &Fixups) const {
 
   const MCOperand &MO = MI.getOperand(OpNo);
-  assert(MO.isExpr() && "getJumpTargetOpValue expects only expressions");
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm()) return MO.getImm();
+  assert(MO.isExpr() &&
+         "getJumpTargetOpValue expects only expressions or an immediate");
 
   const MCExpr *Expr = MO.getExpr();
   Fixups.push_back(MCFixup::Create(0, Expr,
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index 2bc286b..ec84ad8 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -26,7 +26,7 @@ using namespace llvm;
 
 Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
   : MipsInstrInfo(tm, /* FIXME: set mips16 unconditional br */ 0),
-    RI(*tm.getSubtargetImpl(), *this) {}
+    RI(*tm.getSubtargetImpl()) {}
 
 const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
   return RI;
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index c15d1bf..106e82f 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -38,9 +38,8 @@
 
 using namespace llvm;
 
-Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST,
-                                       const TargetInstrInfo &TII)
-  : MipsRegisterInfo(ST, TII) {}
+Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST)
+  : MipsRegisterInfo(ST) {}
 
 // This function eliminate ADJCALLSTACKDOWN,
 // ADJCALLSTACKUP pseudo instructions
diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h
index 3f4b3a7..c702a15 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.h
+++ b/lib/Target/Mips/Mips16RegisterInfo.h
@@ -17,11 +17,11 @@
 #include "MipsRegisterInfo.h"
 
 namespace llvm {
+class Mips16InstrInfo;
 
 class Mips16RegisterInfo : public MipsRegisterInfo {
 public:
-  Mips16RegisterInfo(const MipsSubtarget &Subtarget,
-                     const TargetInstrInfo &TII);
+  Mips16RegisterInfo(const MipsSubtarget &Subtarget);
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 20fc178..147be5d 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -110,9 +110,9 @@ def DSLLV    : shift_rotate_reg<0x14, 0x00, "dsllv", shl, CPU64Regs>;
 def DSRLV    : shift_rotate_reg<0x16, 0x00, "dsrlv", srl, CPU64Regs>;
 def DSRAV    : shift_rotate_reg<0x17, 0x00, "dsrav", sra, CPU64Regs>;
 let Pattern = []<dag> in {
-def DSLL32   : shift_rotate_imm64<0x3c, 0x00, "dsll32", shl>;
-def DSRL32   : shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl>;
-def DSRA32   : shift_rotate_imm64<0x3f, 0x00, "dsra32", sra>;
+  def DSLL32   : shift_rotate_imm64<0x3c, 0x00, "dsll32", shl>;
+  def DSRL32   : shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl>;
+  def DSRA32   : shift_rotate_imm64<0x3f, 0x00, "dsra32", sra>;
 }
 }
 // Rotate Instructions
@@ -217,7 +217,15 @@ let DecoderNamespace = "Mips64" in {
 def RDHWR64 : ReadHardware<CPU64Regs, HWRegs64>;
 
 def DEXT : ExtBase<3, "dext", CPU64Regs>;
+let Pattern = []<dag> in {
+  def DEXTU : ExtBase<2, "dextu", CPU64Regs>;
+  def DEXTM : ExtBase<1, "dextm", CPU64Regs>;
+}
 def DINS : InsBase<7, "dins", CPU64Regs>;
+let Pattern = []<dag> in {
+  def DINSU : InsBase<6, "dinsu", CPU64Regs>;
+  def DINSM : InsBase<5, "dinsm", CPU64Regs>;
+}
 
 let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
   def DSLL64_32 : FR<0x00, 0x3c, (outs CPU64Regs:$rd), (ins CPURegs:$rt),
diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
index dc8fbd0..99b163e 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.cpp
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
@@ -91,7 +91,7 @@ void MipsAnalyzeImmediate::ReplaceADDiuSLLWithLUi(InstSeq &Seq) {
 
   // Sign-extend and shift operand of ADDiu and see if it still fits in 16-bit.
   int64_t Imm = SignExtend64<16>(Seq[0].ImmOpnd);
-  int64_t ShiftedImm = Imm << (Seq[1].ImmOpnd - 16);
+  int64_t ShiftedImm = (uint64_t)Imm << (Seq[1].ImmOpnd - 16);
 
   if (!isInt<16>(ShiftedImm))
     return;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 00ff754..e780134 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -15,6 +15,7 @@
 #define DEBUG_TYPE "mips-asm-printer"
 #include "Mips.h"
 #include "MipsAsmPrinter.h"
+#include "MipsDirectObjLower.h"
 #include "MipsInstrInfo.h"
 #include "MipsMCInstLower.h"
 #include "InstPrinter/MipsInstPrinter.h"
@@ -58,33 +59,31 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
 
-  // Direct object specific instruction lowering
-  if (!OutStreamer.hasRawTextSupport())
-    switch (MI->getOpcode()) {
-    case Mips::DSLL:
-    case Mips::DSRL:
-    case Mips::DSRA:
-      assert(MI->getNumOperands() == 3 &&
-             "Invalid no. of machine operands for shift!");
-      assert(MI->getOperand(2).isImm());
-      int64_t Shift = MI->getOperand(2).getImm();
-      if (Shift > 31) {
-        MCInst TmpInst0;
-        MCInstLowering.LowerLargeShift(MI, TmpInst0, Shift - 32);
-        OutStreamer.EmitInstruction(TmpInst0);
-        return;
-      }
-      break;
-    }
-
   MachineBasicBlock::const_instr_iterator I = MI;
   MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
 
   do {
     MCInst TmpInst0;
     MCInstLowering.Lower(I++, TmpInst0);
+
+    // Direct object specific instruction lowering
+    if (!OutStreamer.hasRawTextSupport()){
+      switch (TmpInst0.getOpcode()) {
+      // If shift amount is >= 32 it the inst needs to be lowered further
+      case Mips::DSLL:
+      case Mips::DSRL:
+      case Mips::DSRA:
+        Mips::LowerLargeShift(TmpInst0);
+        break;
+        // Double extract instruction is chosen by pos and size operands
+      case Mips::DEXT:
+      case Mips::DINS:
+        Mips::LowerDextDins(TmpInst0);
+      }
+    }
+
     OutStreamer.EmitInstruction(TmpInst0);
-  } while ((I != E) && I->isInsideBundle());
+  } while ((I != E) && I->isInsideBundle()); // Delay slot check
 }
 
 //===----------------------------------------------------------------------===//
@@ -214,7 +213,7 @@ const char *MipsAsmPrinter::getCurrentABIString() const {
   case MipsSubtarget::N32:  return "abiN32";
   case MipsSubtarget::N64:  return "abi64";
   case MipsSubtarget::EABI: return "eabi32"; // TODO: handle eabi64
-  default: llvm_unreachable("Unknown Mips ABI");;
+  default: llvm_unreachable("Unknown Mips ABI");
   }
 }
 
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index cb7022b..5433295 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -30,7 +30,6 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -139,7 +138,7 @@ bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
 
   do {
     DEBUG(errs() << "JITTing function '"
-        << MF.getFunction()->getName() << "'\n");
+        << MF.getName() << "'\n");
     MCE.startFunction(MF);
 
     for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 2bba8a3..e3c8ed7 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -30,10 +30,11 @@ STATISTIC(FilledSlots, "Number of delay slots filled");
 STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
                        " are not NOP.");
 
-static cl::opt<bool> EnableDelaySlotFiller(
-  "enable-mips-delay-filler",
+static cl::opt<bool> DisableDelaySlotFiller(
+  "disable-mips-delay-filler",
   cl::init(false),
-  cl::desc("Fill the Mips delay slots useful instructions."),
+  cl::desc("Disable the delay slot filler, which attempts to fill the Mips"
+           "delay slots with useful instructions."),
   cl::Hidden);
 
 // This option can be used to silence complaints by machine verifier passes.
@@ -114,7 +115,9 @@ runOnMachineBasicBlock(MachineBasicBlock &MBB) {
 
       InstrIter D;
 
-      if (EnableDelaySlotFiller && findDelayInstr(MBB, I, D)) {
+      // Delay slot filling is disabled at -O0.
+      if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None) &&
+          findDelayInstr(MBB, I, D)) {
         MBB.splice(llvm::next(I), &MBB, D);
         ++UsefulSlots;
       } else
diff --git a/lib/Target/Mips/MipsDirectObjLower.cpp b/lib/Target/Mips/MipsDirectObjLower.cpp
new file mode 100644
index 0000000..0d74db8
--- /dev/null
+++ b/lib/Target/Mips/MipsDirectObjLower.cpp
@@ -0,0 +1,86 @@
+//===-- MipsDirectObjLower.cpp - Mips LLVM direct object lowering -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Mips MCInst records that are normally
+// left to the assembler to lower such as large shifts.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsDirectObjLower.h"
+#include "MipsInstrInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+// If the D<shift> instruction has a shift amount that is greater
+// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
+void Mips::LowerLargeShift(MCInst& Inst) {
+
+  assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
+  assert(Inst.getOperand(2).isImm());
+
+  bool isLarge = false;
+  int64_t Shift;
+  Shift = Inst.getOperand(2).getImm();
+  if (Shift > 31) {
+    Shift -= 32;
+    isLarge = true;
+  }
+
+  // saminus32
+  (Inst.getOperand(2)).setImm(Shift);
+
+  if (isLarge)
+    switch (Inst.getOpcode()) {
+    default:
+      // Calling function is not synchronized
+      llvm_unreachable("Unexpected shift instruction");
+    case Mips::DSLL:
+      Inst.setOpcode(Mips::DSLL32);
+      return;
+    case Mips::DSRL:
+      Inst.setOpcode(Mips::DSRL32);
+      return;
+    case Mips::DSRA:
+      Inst.setOpcode(Mips::DSRA32);
+      return;
+    }
+}
+
+// Pick a DEXT or DINS instruction variant based on the pos and size operands
+void Mips::LowerDextDins(MCInst& InstIn) {
+  int Opcode = InstIn.getOpcode();
+
+  if (Opcode == Mips::DEXT)
+    assert(InstIn.getNumOperands() == 4 &&
+           "Invalid no. of machine operands for DEXT!");
+  else // Only DEXT and DINS are possible
+    assert(InstIn.getNumOperands() == 5 &&
+           "Invalid no. of machine operands for DINS!");
+
+  assert(InstIn.getOperand(2).isImm());
+  int64_t pos = InstIn.getOperand(2).getImm();
+  assert(InstIn.getOperand(3).isImm());
+  int64_t size = InstIn.getOperand(3).getImm();
+
+  if (size <= 32) {
+    if ((pos < 32)) { // DEXT/DINS, do nothing
+      return;
+    } else { // DEXTU/DINSU
+      InstIn.getOperand(2).setImm(pos - 32);
+      InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTU : Mips::DINSU);
+      return;
+    }
+  } else { // DEXTM/DINSM
+    assert(pos < 32 && "DEXT/DINS cannot have both size and pos > 32");
+    InstIn.getOperand(3).setImm(size - 32);
+    InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTM : Mips::DINSM);
+    return;
+  }
+}
diff --git a/lib/Target/Mips/MipsDirectObjLower.h b/lib/Target/Mips/MipsDirectObjLower.h
new file mode 100644
index 0000000..8813cc9
--- /dev/null
+++ b/lib/Target/Mips/MipsDirectObjLower.h
@@ -0,0 +1,28 @@
+//===-- MipsDirectObjLower.h - Mips LLVM direct object lowering *- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSDIRECTOBJLOWER_H
+#define MIPSDIRECTOBJLOWER_H
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class MCInst;
+  class MCStreamer;
+
+  namespace Mips {
+  /// MipsDirectObjLower - This name space is used to lower MCInstr in cases
+  //                       where the assembler usually finishes the lowering
+  //                       such as large shifts.
+    void LowerLargeShift(MCInst &Inst);
+    void LowerDextDins(MCInst &Inst);
+  }
+}
+
+#endif
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 5a97c17..4205223 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -337,8 +337,9 @@ SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) {
     // Generate:
     //  lui $2, %hi($CPI1_0)
     //  lwc1 $f0, %lo($CPI1_0)($2)
-    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo) {
-      SDValue LoVal = Addr.getOperand(1), Opnd0 = LoVal.getOperand(0);
+    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
       if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
           isa<JumpTableSDNode>(Opnd0)) {
         Base = Addr.getOperand(0);
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index c5207c6..aa7b459 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -1571,15 +1571,15 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
     SDVTList VTs = DAG.getVTList(MVT::i32);
 
-    MipsTargetObjectFile &TLOF = (MipsTargetObjectFile&)getObjFileLowering();
+    const MipsTargetObjectFile &TLOF = (const MipsTargetObjectFile&)getObjFileLowering();
 
     // %gp_rel relocation
     if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
       SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
                                               MipsII::MO_GPREL);
       SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, dl, VTs, &GA, 1);
-      SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
-      return DAG.getNode(ISD::ADD, dl, MVT::i32, GOT, GPRelNode);
+      SDValue GPReg = DAG.getRegister(Mips::GP, MVT::i32);
+      return DAG.getNode(ISD::ADD, dl, MVT::i32, GPReg, GPRelNode);
     }
     // %hi/%lo relocation
     SDValue GAHi = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 50e3eb5..8ade891 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -262,46 +262,3 @@ unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
   }
   }
 }
-
-unsigned
-llvm::Mips::loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII,
-                          MachineBasicBlock& MBB,
-                          MachineBasicBlock::iterator II, DebugLoc DL,
-                          bool LastInstrIsADDiu,
-                          MipsAnalyzeImmediate::Inst *LastInst) {
-  MipsAnalyzeImmediate AnalyzeImm;
-  unsigned Size = IsN64 ? 64 : 32;
-  unsigned LUi = IsN64 ? Mips::LUi64 : Mips::LUi;
-  unsigned ZEROReg = IsN64 ? Mips::ZERO_64 : Mips::ZERO;
-  unsigned ATReg = IsN64 ? Mips::AT_64 : Mips::AT;
-
-  const MipsAnalyzeImmediate::InstSeq &Seq =
-    AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu);
-  MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
-
-  if (LastInst && (Seq.size() == 1)) {
-    *LastInst = *Inst;
-    return 0;
-  }
-
-  // The first instruction can be a LUi, which is different from other
-  // instructions (ADDiu, ORI and SLL) in that it does not have a register
-  // operand.
-  if (Inst->Opc == LUi)
-    BuildMI(MBB, II, DL, TII.get(LUi), ATReg)
-      .addImm(SignExtend64<16>(Inst->ImmOpnd));
-  else
-    BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg)
-      .addImm(SignExtend64<16>(Inst->ImmOpnd));
-
-  // Build the remaining instructions in Seq. Skip the last instruction if
-  // LastInst is not 0.
-  for (++Inst; Inst != Seq.end() - !!LastInst; ++Inst)
-    BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg)
-      .addImm(SignExtend64<16>(Inst->ImmOpnd));
-
-  if (LastInst)
-    *LastInst = *Inst;
-
-  return Seq.size() - !!LastInst;
-}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 7d56259..aca2bc7 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -88,18 +88,6 @@ private:
                    const SmallVectorImpl<MachineOperand>& Cond) const;
 };
 
-namespace Mips {
-  /// Emit a series of instructions to load an immediate. All instructions
-  /// except for the last one are emitted. The function returns the number of
-  /// MachineInstrs generated. The opcode-immediate pair of the last
-  /// instruction is returned in LastInst, if it is not 0.
-  unsigned
-  loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII,
-                MachineBasicBlock& MBB, MachineBasicBlock::iterator II,
-                DebugLoc DL, bool LastInstrIsADDiu,
-                MipsAnalyzeImmediate::Inst *LastInst);
-}
-
 /// Create MipsInstrInfo objects.
 const MipsInstrInfo *createMips16InstrInfo(MipsTargetMachine &TM);
 const MipsInstrInfo *createMipsSEInstrInfo(MipsTargetMachine &TM);
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index fd952ef..6b6005f 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -74,9 +74,10 @@ def MipsRet : SDNode<"MipsISD::Ret", SDTNone, [SDNPHasChain, SDNPOptInGlue]>;
 
 // These are target-independent nodes, but have target-specific formats.
 def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart,
-                           [SDNPHasChain, SDNPOutGlue]>;
+                           [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
 def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
-                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+                           [SDNPHasChain, SDNPSideEffect,
+                            SDNPOptInGlue, SDNPOutGlue]>;
 
 // MAdd*/MSub* nodes
 def MipsMAdd      : SDNode<"MipsISD::MAdd", SDT_MipsMAddMSub,
@@ -110,7 +111,7 @@ def MipsWrapper    : SDNode<"MipsISD::Wrapper", SDTIntBinOp>;
 def MipsDynAlloc  : SDNode<"MipsISD::DynAlloc", SDT_MipsDynAlloc,
                            [SDNPHasChain, SDNPInGlue]>;
 
-def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain]>;
+def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain,SDNPSideEffect]>;
 
 def MipsExt :  SDNode<"MipsISD::Ext", SDT_Ext>;
 def MipsIns :  SDNode<"MipsISD::Ins", SDT_Ins>;
@@ -1079,6 +1080,26 @@ def EXT : ExtBase<0, "ext", CPURegs>;
 def INS : InsBase<4, "ins", CPURegs>;
 
 //===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+def : InstAlias<"move $dst,$src", (ADD CPURegs:$dst,CPURegs:$src,ZERO)>;
+def : InstAlias<"bal $offset", (BGEZAL RA,brtarget:$offset)>;
+def : InstAlias<"addu $rs,$rt,$imm",
+                (ADDiu CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"add $rs,$rt,$imm",
+                (ADDi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"and $rs,$rt,$imm",
+                (ANDi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"j $rs", (JR CPURegs:$rs)>;
+def : InstAlias<"not $rt,$rs", (NOR CPURegs:$rt,CPURegs:$rs,ZERO)>;
+def : InstAlias<"neg $rt,$rs", (SUB CPURegs:$rt,ZERO,CPURegs:$rs)>;
+def : InstAlias<"negu $rt,$rs", (SUBu CPURegs:$rt,ZERO,CPURegs:$rs)>;
+def : InstAlias<"slt $rs,$rt,$imm",
+                (SLTi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"xor $rs,$rt,$imm",
+                (XORi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+
+//===----------------------------------------------------------------------===//
 //  Arbitrary patterns that map to one or more instructions
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index f78203f..b9dbd52 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -10,6 +10,10 @@
 // This pass expands a branch or jump instruction into a long branch if its
 // offset is too large to fit into its immediate field.
 //
+// FIXME: 
+// 1. Fix pc-region jump instructions which cross 256MB segment boundaries. 
+// 2. If program has inline assembly statements whose size cannot be
+//    determined accurately, load branch target addresses from the GOT. 
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "mips-long-branch"
@@ -48,7 +52,7 @@ namespace {
   typedef MachineBasicBlock::reverse_iterator ReverseIter;
 
   struct MBBInfo {
-    uint64_t Size;
+    uint64_t Size, Address;
     bool HasLongBranch;
     MachineInstr *Br;
 
@@ -61,7 +65,10 @@ namespace {
     static char ID;
     MipsLongBranch(TargetMachine &tm)
       : MachineFunctionPass(ID), TM(tm),
-        TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())) {}
+        TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())),
+        IsPIC(TM.getRelocationModel() == Reloc::PIC_),
+        ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
+        LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 13 : 9)) {}
 
     virtual const char *getPassName() const {
       return "Mips Long Branch";
@@ -81,6 +88,9 @@ namespace {
     const MipsInstrInfo *TII;
     MachineFunction *MF;
     SmallVector<MBBInfo, 16> MBBInfos;
+    bool IsPIC;
+    unsigned ABI;
+    unsigned LongBranchSeqSize;
   };
 
   char MipsLongBranch::ID = 0;
@@ -230,12 +240,6 @@ void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
 
 // Expand branch instructions to long branches.
 void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
-  I.HasLongBranch = true;
-
-  bool IsPIC = TM.getRelocationModel() == Reloc::PIC_;
-  unsigned ABI = TM.getSubtarget<MipsSubtarget>().getTargetABI();
-  bool N64 = ABI == MipsSubtarget::N64;
-
   MachineBasicBlock::iterator Pos;
   MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br);
   DebugLoc DL = I.Br->getDebugLoc();
@@ -248,101 +252,105 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
   MBB->addSuccessor(LongBrMBB);
 
   if (IsPIC) {
-    // $longbr:
-    //  addiu $sp, $sp, -regsize * 2
-    //  sw $ra, 0($sp)
-    //  bal $baltgt
-    //  sw $a3, regsize($sp)
-    // $baltgt:
-    //  lui $a3, %hi($baltgt)
-    //  lui $at, %hi($tgt)
-    //  addiu $a3, $a3, %lo($baltgt)
-    //  addiu $at, $at, %lo($tgt)
-    //  subu $at, $at, $a3
-    //  addu $at, $ra, $at
-    //
-    //  if n64:
-    //   lui $a3, %highest($baltgt)
-    //   lui $ra, %highest($tgt)
-    //   addiu $a3, $a3, %higher($baltgt)
-    //   addiu $ra, $ra, %higher($tgt)
-    //   dsll $a3, $a3, 32
-    //   dsll $ra, $ra, 32
-    //   subu $at, $at, $a3
-    //   addu $at, $at, $ra
-    //
-    //  lw $ra, 0($sp)
-    //  lw $a3, regsize($sp)
-    //  jr $at
-    //  addiu $sp, $sp, regsize * 2
-    // $fallthrough:
-    //
-    MF->getInfo<MipsFunctionInfo>()->setEmitNOAT();
     MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
     MF->insert(FallThroughMBB, BalTgtMBB);
     LongBrMBB->addSuccessor(BalTgtMBB);
     BalTgtMBB->addSuccessor(TgtMBB);
 
-    int RegSize = N64 ? 8 : 4;
-    unsigned AT = N64 ? Mips::AT_64 : Mips::AT;
-    unsigned A3 = N64 ? Mips::A3_64 : Mips::A3;
-    unsigned SP = N64 ? Mips::SP_64 : Mips::SP;
-    unsigned RA = N64 ? Mips::RA_64 : Mips::RA;
-    unsigned Load = N64 ? Mips::LD_P8 : Mips::LW;
-    unsigned Store = N64 ? Mips::SD_P8 : Mips::SW;
-    unsigned LUi = N64 ? Mips::LUi64 : Mips::LUi;
-    unsigned ADDiu = N64 ? Mips::DADDiu : Mips::ADDiu;
-    unsigned ADDu = N64 ? Mips::DADDu : Mips::ADDu;
-    unsigned SUBu = N64 ? Mips::SUBu : Mips::SUBu;
-    unsigned JR = N64 ? Mips::JR64 : Mips::JR;
-
-    Pos = LongBrMBB->begin();
-
-    BuildMI(*LongBrMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP)
-      .addImm(-RegSize * 2);
-    BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(RA).addReg(SP)
-      .addImm(0);
-    BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB);
-    BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(A3).addReg(SP)
-      .addImm(RegSize)->setIsInsideBundle();
-
-    Pos = BalTgtMBB->begin();
-
-    BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3)
-      .addMBB(BalTgtMBB, MipsII::MO_ABS_HI);
-    BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), AT)
-      .addMBB(TgtMBB, MipsII::MO_ABS_HI);
-    BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3)
-      .addMBB(BalTgtMBB, MipsII::MO_ABS_LO);
-    BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), AT).addReg(AT)
-      .addMBB(TgtMBB, MipsII::MO_ABS_LO);
-    BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3);
-    BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(RA).addReg(AT);
-
-    if (N64) {
-      BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3)
-        .addMBB(BalTgtMBB, MipsII::MO_HIGHEST);
-      BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), RA)
-        .addMBB(TgtMBB, MipsII::MO_HIGHEST);
-      BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3)
-        .addMBB(BalTgtMBB, MipsII::MO_HIGHER);
-      BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), RA).addReg(RA)
-        .addMBB(TgtMBB, MipsII::MO_HIGHER);
-      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), A3).addReg(A3)
-        .addImm(32);
-      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), RA).addReg(RA)
-        .addImm(32);
-      BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3);
-      BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(AT).addReg(RA);
-      I.Size += 4 * 8;
+    int64_t TgtAddress = MBBInfos[TgtMBB->getNumber()].Address;
+    int64_t Offset = TgtAddress - (I.Address + I.Size - 20);
+    int64_t Lo = SignExtend64<16>(Offset & 0xffff);
+    int64_t Hi = SignExtend64<16>(((Offset + 0x8000) >> 16) & 0xffff);
+
+    if (ABI != MipsSubtarget::N64) {
+      // $longbr:
+      //  addiu $sp, $sp, -8
+      //  sw $ra, 0($sp)
+      //  bal $baltgt
+      //  lui $at, %hi($tgt - $baltgt)
+      // $baltgt:
+      //  addiu $at, $at, %lo($tgt - $baltgt)
+      //  addu $at, $ra, $at
+      //  lw $ra, 0($sp)
+      //  jr $at
+      //  addiu $sp, $sp, 8
+      // $fallthrough:
+      //
+
+      Pos = LongBrMBB->begin();
+
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+        .addReg(Mips::SP).addImm(-8);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA)
+        .addReg(Mips::SP).addImm(0);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LUi), Mips::AT).addImm(Hi)
+        ->setIsInsideBundle();
+
+      Pos = BalTgtMBB->begin();
+
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::AT)
+        .addReg(Mips::AT).addImm(Lo);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT)
+        .addReg(Mips::RA).addReg(Mips::AT);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
+        .addReg(Mips::SP).addImm(0);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+        .addReg(Mips::SP).addImm(8)->setIsInsideBundle();
+    } else {
+      // $longbr:
+      //  daddiu $sp, $sp, -16
+      //  sd $ra, 0($sp)
+      //  lui64 $at, %highest($tgt - $baltgt)
+      //  daddiu $at, $at, %higher($tgt - $baltgt)
+      //  dsll $at, $at, 16
+      //  daddiu $at, $at, %hi($tgt - $baltgt)
+      //  bal $baltgt
+      //  dsll $at, $at, 16
+      // $baltgt:
+      //  daddiu $at, $at, %lo($tgt - $baltgt)
+      //  daddu $at, $ra, $at
+      //  ld $ra, 0($sp)
+      //  jr64 $at
+      //  daddiu $sp, $sp, 16
+      // $fallthrough:
+      //
+
+      int64_t Higher = SignExtend64<16>(((Offset + 0x80008000) >> 32) & 0xffff);
+      int64_t Highest =
+        SignExtend64<16>(((Offset + 0x800080008000LL) >> 48) & 0xffff);
+
+      Pos = LongBrMBB->begin();
+
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+        .addReg(Mips::SP_64).addImm(-16);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64)
+        .addReg(Mips::SP_64).addImm(0);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LUi64), Mips::AT_64)
+        .addImm(Highest);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+        .addReg(Mips::AT_64).addImm(Higher);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
+        .addReg(Mips::AT_64).addImm(16);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+        .addReg(Mips::AT_64).addImm(Hi);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
+        .addReg(Mips::AT_64).addImm(16)->setIsInsideBundle();
+
+      Pos = BalTgtMBB->begin();
+
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+        .addReg(Mips::AT_64).addImm(Lo);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64)
+        .addReg(Mips::RA_64).addReg(Mips::AT_64);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64)
+        .addReg(Mips::SP_64).addImm(0);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+        .addReg(Mips::SP_64).addImm(16)->setIsInsideBundle();
     }
-
-    BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), RA).addReg(SP).addImm(0);
-    BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), A3).addReg(SP).addImm(RegSize);
-    BuildMI(*BalTgtMBB, Pos, DL, TII->get(JR)).addReg(AT);
-    BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP)
-      .addImm(RegSize * 2)->setIsInsideBundle();
-    I.Size += 4 * 14;
   } else {
     // $longbr:
     //  j $tgt
@@ -353,7 +361,6 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
     LongBrMBB->addSuccessor(TgtMBB);
     BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::J)).addMBB(TgtMBB);
     BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::NOP))->setIsInsideBundle();
-    I.Size += 4 * 2;
   }
 
   if (I.Br->isUnconditionalBranch()) {
@@ -401,19 +408,36 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
       if (!I->Br || I->HasLongBranch)
         continue;
 
-      if (!ForceLongBranch)
-        // Check if offset fits into 16-bit immediate field of branches.
-        if (isInt<16>(computeOffset(I->Br) / 4))
-          continue;
+      // Check if offset fits into 16-bit immediate field of branches.
+      if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / 4))
+        continue;
 
-      expandToLongBranch(*I);
+      I->HasLongBranch = true;
+      I->Size += LongBranchSeqSize * 4;
       ++LongBranches;
       EverMadeChange = MadeChange = true;
     }
   }
 
-  if (EverMadeChange)
-    MF->RenumberBlocks();
+  if (!EverMadeChange)
+    return true;
+
+  // Compute basic block addresses.
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    MF->getInfo<MipsFunctionInfo>()->setEmitNOAT();
+
+    uint64_t Address = 0;
+
+    for (I = MBBInfos.begin(); I != E; Address += I->Size, ++I)
+      I->Address = Address;
+  }
+
+  // Do the expansion.
+  for (I = MBBInfos.begin(); I != E; ++I)
+    if (I->HasLongBranch)
+      expandToLongBranch(*I);
+
+  MF->RenumberBlocks();
 
   return true;
 }
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index d4c5e6d..5fa6339 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -11,7 +11,6 @@
 // MCInst records.
 //
 //===----------------------------------------------------------------------===//
-
 #include "MipsMCInstLower.h"
 #include "MipsAsmPrinter.h"
 #include "MipsInstrInfo.h"
@@ -161,31 +160,3 @@ void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
   }
 }
 
-// If the D<shift> instruction has a shift amount that is greater
-// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
-void MipsMCInstLower::LowerLargeShift(const MachineInstr *MI,
-                                      MCInst& Inst,
-                                      int64_t Shift) {
-  // rt
-  Inst.addOperand(LowerOperand(MI->getOperand(0)));
-  // rd
-  Inst.addOperand(LowerOperand(MI->getOperand(1)));
-  // saminus32
-  Inst.addOperand(MCOperand::CreateImm(Shift));
-
-  switch (MI->getOpcode()) {
-  default:
-    // Calling function is not synchronized
-    llvm_unreachable("Unexpected shift instruction");
-    break;
-  case Mips::DSLL:
-    Inst.setOpcode(Mips::DSLL32);
-    break;
-  case Mips::DSRL:
-    Inst.setOpcode(Mips::DSRL32);
-    break;
-  case Mips::DSRA:
-    Inst.setOpcode(Mips::DSRA32);
-    break;
-  }
-}
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index 0abb996..3eab5a4 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -33,7 +33,6 @@ public:
   MipsMCInstLower(MipsAsmPrinter &asmprinter);
   void Initialize(Mangler *mang, MCContext *C);
   void Lower(const MachineInstr *MI, MCInst &OutMI) const;
-  void LowerLargeShift(const MachineInstr *MI, MCInst &Inst, int64_t Shift);
 
 private:
   MCOperand LowerSymbolOperand(const MachineOperand &MO,
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index ae6ae3a..79a142a 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -22,7 +22,6 @@
 #include "llvm/Constants.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/Type.h"
-#include "llvm/Function.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -43,9 +42,8 @@
 
 using namespace llvm;
 
-MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST,
-                                   const TargetInstrInfo &tii)
-  : MipsGenRegisterInfo(Mips::RA), Subtarget(ST), TII(tii) {}
+MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST)
+  : MipsGenRegisterInfo(Mips::RA), Subtarget(ST) {}
 
 unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
 
@@ -131,6 +129,12 @@ getReservedRegs(const MachineFunction &MF) const {
     Reserved.set(Mips::RA_64);
   }
 
+  // Reserve GP if small section is used.
+  if (Subtarget.useSmallSection()) {
+    Reserved.set(Mips::GP);
+    Reserved.set(Mips::GP_64);
+  }
+
   return Reserved;
 }
 
@@ -160,7 +164,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
            "Instr doesn't have FrameIndex operand!");
   }
 
-  DEBUG(errs() << "\nFunction : " << MF.getFunction()->getName() << "\n";
+  DEBUG(errs() << "\nFunction : " << MF.getName() << "\n";
         errs() << "<--------->\n" << MI);
 
   int FrameIndex = MI.getOperand(i).getIndex();
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 9a05e94..78adf7f 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -22,16 +22,14 @@
 
 namespace llvm {
 class MipsSubtarget;
-class TargetInstrInfo;
 class Type;
 
 class MipsRegisterInfo : public MipsGenRegisterInfo {
 protected:
   const MipsSubtarget &Subtarget;
-  const TargetInstrInfo &TII;
 
 public:
-  MipsRegisterInfo(const MipsSubtarget &Subtarget, const TargetInstrInfo &tii);
+  MipsRegisterInfo(const MipsSubtarget &Subtarget);
 
   /// getRegisterNumbering - Given the enum value for some register, e.g.
   /// Mips::RA, return the number that it corresponds to (e.g. 31).
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index eeb1de3..e4b44ef 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -260,14 +260,53 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
   if (isInt<16>(Amount))// addi sp, sp, amount
     BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount);
   else { // Expand immediate that doesn't fit in 16-bit.
-    unsigned ATReg = STI.isABI_N64() ? Mips::AT_64 : Mips::AT;
-
     MBB.getParent()->getInfo<MipsFunctionInfo>()->setEmitNOAT();
-    Mips::loadImmediate(Amount, STI.isABI_N64(), *this, MBB, I, DL, false, 0);
-    BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(ATReg);
+    unsigned Reg = loadImmediate(Amount, MBB, I, DL, 0);
+    BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(Reg);
   }
 }
 
+/// This function generates the sequence of instructions needed to get the
+/// result of adding register REG and immediate IMM.
+unsigned
+MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator II, DebugLoc DL,
+                               unsigned *NewImm) const {
+  MipsAnalyzeImmediate AnalyzeImm;
+  const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
+  unsigned Size = STI.isABI_N64() ? 64 : 32;
+  unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi;
+  unsigned ZEROReg = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+  unsigned ATReg = STI.isABI_N64() ? Mips::AT_64 : Mips::AT;
+  bool LastInstrIsADDiu = NewImm;
+
+  const MipsAnalyzeImmediate::InstSeq &Seq =
+    AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu);
+  MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+
+  assert(Seq.size() && (!LastInstrIsADDiu || (Seq.size() > 1)));
+
+  // The first instruction can be a LUi, which is different from other
+  // instructions (ADDiu, ORI and SLL) in that it does not have a register
+  // operand.
+  if (Inst->Opc == LUi)
+    BuildMI(MBB, II, DL, get(LUi), ATReg)
+      .addImm(SignExtend64<16>(Inst->ImmOpnd));
+  else
+    BuildMI(MBB, II, DL, get(Inst->Opc), ATReg).addReg(ZEROReg)
+      .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+  // Build the remaining instructions in Seq.
+  for (++Inst; Inst != Seq.end() - LastInstrIsADDiu; ++Inst)
+    BuildMI(MBB, II, DL, get(Inst->Opc), ATReg).addReg(ATReg)
+      .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+  if (LastInstrIsADDiu)
+    *NewImm = Inst->ImmOpnd;
+
+  return ATReg;
+}
+
 unsigned MipsSEInstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
   return (Opc == Mips::BEQ    || Opc == Mips::BNE    || Opc == Mips::BGTZ   ||
           Opc == Mips::BGEZ   || Opc == Mips::BLTZ   || Opc == Mips::BLEZ   ||
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index 346e74d..55b78b2 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -15,7 +15,6 @@
 #define MIPSSEINSTRUCTIONINFO_H
 
 #include "MipsInstrInfo.h"
-#include "MipsAnalyzeImmediate.h"
 #include "MipsSERegisterInfo.h"
 
 namespace llvm {
@@ -70,6 +69,13 @@ public:
   void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator I) const;
 
+  /// Emit a series of instructions to load an immediate. If NewImm is a
+  /// non-NULL parameter, the last instruction is not emitted, but instead
+  /// its immediate operand is returned in NewImm.
+  unsigned loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator II, DebugLoc DL,
+                         unsigned *NewImm) const;
+
 private:
   virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
 
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 043a1ef..d868f73 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -40,8 +40,8 @@
 using namespace llvm;
 
 MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST,
-                                       const TargetInstrInfo &TII)
-  : MipsRegisterInfo(ST, TII) {}
+                                       const MipsSEInstrInfo &I)
+  : MipsRegisterInfo(ST), TII(I) {}
 
 // This function eliminate ADJCALLSTACKDOWN,
 // ADJCALLSTACKUP pseudo instructions
@@ -122,15 +122,14 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
     DebugLoc DL = II->getDebugLoc();
     unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
     unsigned ATReg = Subtarget.isABI_N64() ? Mips::AT_64 : Mips::AT;
-    MipsAnalyzeImmediate::Inst LastInst(0, 0);
+    unsigned NewImm;
 
     MipsFI->setEmitNOAT();
-    Mips::loadImmediate(Offset, Subtarget.isABI_N64(), TII, MBB, II, DL, true,
-                        &LastInst);
-    BuildMI(MBB, II, DL, TII.get(ADDu), ATReg).addReg(FrameReg).addReg(ATReg);
+    unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL, &NewImm);
+    BuildMI(MBB, II, DL, TII.get(ADDu), ATReg).addReg(FrameReg).addReg(Reg);
 
     FrameReg = ATReg;
-    Offset = SignExtend64<16>(LastInst.ImmOpnd);
+    Offset = SignExtend64<16>(NewImm);
   }
 
   MI.getOperand(OpNo).ChangeToRegister(FrameReg, false);
diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h
index 4b17b33..b4eab65 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.h
+++ b/lib/Target/Mips/MipsSERegisterInfo.h
@@ -18,11 +18,14 @@
 #include "MipsRegisterInfo.h"
 
 namespace llvm {
+class MipsSEInstrInfo;
 
 class MipsSERegisterInfo : public MipsRegisterInfo {
+  const MipsSEInstrInfo &TII;
+
 public:
   MipsSERegisterInfo(const MipsSubtarget &Subtarget,
-                     const TargetInstrInfo &TII);
+                     const MipsSEInstrInfo &TII);
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 11ff809..ac83d83 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -25,7 +25,8 @@ using namespace llvm;
 void MipsSubtarget::anchor() { }
 
 MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
-                             const std::string &FS, bool little) :
+                             const std::string &FS, bool little,
+                             Reloc::Model RM) :
   MipsGenSubtargetInfo(TT, CPU, FS),
   MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little),
   IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false),
@@ -54,6 +55,9 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
   // Is the target system Linux ?
   if (TT.find("linux") == std::string::npos)
     IsLinux = false;
+
+  // Set UseSmallSection.
+  UseSmallSection = !IsLinux && (RM == Reloc::Static);
 }
 
 bool
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index ba15362..0595e8d 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -65,6 +65,9 @@ protected:
   // isLinux - Target system is Linux. Is false we consider ELFOS for now.
   bool IsLinux;
 
+  // UseSmallSection - Small section is used.
+  bool UseSmallSection;
+
   /// Features related to the presence of specific instructions.
 
   // HasSEInReg - SEB and SEH (signext in register) instructions.
@@ -109,7 +112,7 @@ public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   MipsSubtarget(const std::string &TT, const std::string &CPU,
-                const std::string &FS, bool little);
+                const std::string &FS, bool little, Reloc::Model RM);
 
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
@@ -133,6 +136,7 @@ public:
   bool inMips16Mode() const { return InMips16Mode; }
   bool isAndroid() const { return IsAndroid; }
   bool isLinux() const { return IsLinux; }
+  bool useSmallSection() const { return UseSmallSection; }
 
   bool hasStandardEncoding() const { return !inMips16Mode(); }
 
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 2928a73..b70542b 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -42,7 +42,7 @@ MipsTargetMachine(const Target &T, StringRef TT,
                   CodeGenOpt::Level OL,
                   bool isLittle)
   : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-    Subtarget(TT, CPU, FS, isLittle),
+    Subtarget(TT, CPU, FS, isLittle, RM),
     DataLayout(isLittle ?
                (Subtarget.isABI_N64() ?
                 "e-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" :
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index 04dc60a..1f5e34f 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -26,6 +26,7 @@ SSThreshold("mips-ssection-threshold", cl::Hidden,
 
 void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
 
   SmallDataSection =
     getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
@@ -60,9 +61,10 @@ bool MipsTargetObjectFile::
 IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
                        SectionKind Kind) const {
 
-  // Only use small section for non linux targets.
   const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
-  if (Subtarget.isLinux())
+
+  // Return if small section is not available.
+  if (!Subtarget.useSmallSection())
     return false;
 
   // Only global variables, not functions.
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index d175e3e..413142e 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -137,7 +137,7 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
 void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
                                        raw_ostream &O) {
   char Value = MI->getOperand(OpNo).getImm();
-  Value = (Value << (32-5)) >> (32-5);
+  Value = SignExtend32<5>(Value);
   O << (int)Value;
 }
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 245b457..b9ea8b5 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -64,7 +64,6 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
     
   ZeroDirective = "\t.space\t";
   Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
-  LCOMMDirectiveType = LCOMM::NoAlignment;
   AssemblerDialect = 0;           // Old-Style mnemonics.
 }
 
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index b7f1688..cb15dad 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -35,6 +35,10 @@ def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
 def Directive32  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
 def Directive64  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
 def DirectiveA2  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">;
+def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective",
+                                       "PPC::DIR_E500mc", "">;
+def DirectiveE5500  : SubtargetFeature<"", "DarwinDirective", 
+                                       "PPC::DIR_E5500", "">;
 def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">;
 def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">;
 
@@ -94,6 +98,12 @@ def : Processor<"g5", G5Itineraries,
                   [Directive970, FeatureAltivec,
                    FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
                    Feature64Bit /*, Feature64BitRegs */]>;
+def : ProcessorModel<"e500mc", PPCE500mcModel,
+                  [DirectiveE500mc, FeatureMFOCRF,
+                   FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
+def : ProcessorModel<"e5500", PPCE5500Model,
+                  [DirectiveE5500, FeatureMFOCRF, Feature64Bit,
+                   FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
 def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE,
                                          FeatureMFOCRF, FeatureFSqrt,
                                          FeatureSTFIWX, FeatureISEL,
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index f76b89c..6e0e8bb 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -109,6 +109,8 @@ namespace {
     bool doFinalization(Module &M);
 
     virtual void EmitFunctionEntryLabel();
+
+    void EmitFunctionBodyEnd();
   };
 
   /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
@@ -345,23 +347,32 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     OutStreamer.EmitLabel(PICBase);
     return;
   }
+  case PPC::LDtocJTI:
+  case PPC::LDtocCPT:
   case PPC::LDtoc: {
     // Transform %X3 = LDtoc <ga:@min1>, %X2
     LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
-      
+
     // Change the opcode to LD, and the global address operand to be a
     // reference to the TOC entry we will synthesize later.
     TmpInst.setOpcode(PPC::LD);
     const MachineOperand &MO = MI->getOperand(1);
-    assert(MO.isGlobal());
-      
-    // Map symbol -> label of TOC entry.
-    MCSymbol *&TOCEntry = TOC[Mang->getSymbol(MO.getGlobal())];
+
+    // Map symbol -> label of TOC entry
+    assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
+    MCSymbol *MOSymbol = 0;
+    if (MO.isGlobal())
+      MOSymbol = Mang->getSymbol(MO.getGlobal());
+    else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    else if (MO.isJTI())
+      MOSymbol = GetJTISymbol(MO.getIndex());
+    MCSymbol *&TOCEntry = TOC[MOSymbol];
     if (TOCEntry == 0)
       TOCEntry = GetTempSymbol("C", TOCLabelID++);
-      
+    
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
+      MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC_ENTRY,
                               OutContext);
     TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
     OutStreamer.EmitInstruction(TmpInst);
@@ -406,9 +417,9 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
     OutContext.GetOrCreateSymbol(".L." + Twine(CurrentFnSym->getName()));
   MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC.@tocbase"));
   OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext),
-                        Subtarget.isPPC64() ? 8 : 4/*size*/, 0/*addrspace*/);
+                        8/*size*/, 0/*addrspace*/);
   OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2, OutContext),
-                        Subtarget.isPPC64() ? 8 : 4/*size*/, 0/*addrspace*/);
+                        8/*size*/, 0/*addrspace*/);
   OutStreamer.SwitchSection(Current);
 
   MCSymbol *RealFnSym = OutContext.GetOrCreateSymbol(
@@ -441,6 +452,23 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
   return AsmPrinter::doFinalization(M);
 }
 
+/// EmitFunctionBodyEnd - Print the traceback table before the .size
+/// directive.
+///
+void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
+  // Only the 64-bit target requires a traceback table.  For now,
+  // we only emit the word of zeroes that GDB requires to find
+  // the end of the function, and zeroes for the eight-byte
+  // mandatory fields.
+  // FIXME: We should fill in the eight-byte mandatory fields as described in
+  // the PPC64 ELF ABI (this is a low-priority item because GDB does not
+  // currently make use of these fields).
+  if (Subtarget.isPPC64()) {
+    OutStreamer.EmitIntValue(0, 4/*size*/);
+    OutStreamer.EmitIntValue(0, 8/*size*/);
+  }
+}
+
 void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
   static const char *const CPUDirectives[] = {
     "",
@@ -453,6 +481,8 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
     "ppc750",
     "ppc970",
     "ppcA2",
+    "ppce500mc",
+    "ppce5500",
     "power6",
     "power7",
     "ppc64"
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index a00f686..e8f4d16 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -975,6 +975,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
 
   case ISD::AND: {
     unsigned Imm, Imm2, SH, MB, ME;
+    uint64_t Imm64;
 
     // If this is an and of a value rotated between 0 and 31 bits and then and'd
     // with a mask, emit rlwinm
@@ -993,6 +994,14 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
       SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) };
       return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
     }
+    // If this is a 64-bit zero-extension mask, emit rldicl.
+    if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
+        isMask_64(Imm64)) {
+      SDValue Val = N->getOperand(0);
+      MB = 64 - CountTrailingOnes_64(Imm64);
+      SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB) };
+      return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops, 3);
+    }
     // AND X, 0 -> 0, not "rlwinm 32".
     if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
       ReplaceUses(SDValue(N, 0), N->getOperand(1));
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 61d44c5..dbb3b14 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -449,6 +449,21 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   setSchedulingPreference(Sched::Hybrid);
 
   computeRegisterProperties();
+
+  // The Freescale cores does better with aggressive inlining of memcpy and
+  // friends. Gcc uses same threshold of 128 bytes (= 32 word stores).
+  if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc ||
+      Subtarget->getDarwinDirective() == PPC::DIR_E5500) {
+    maxStoresPerMemset = 32;
+    maxStoresPerMemsetOptSize = 16;
+    maxStoresPerMemcpy = 32;
+    maxStoresPerMemcpyOptSize = 8;
+    maxStoresPerMemmove = 32;
+    maxStoresPerMemmoveOptSize = 8;
+
+    setPrefFunctionAlignment(4);
+    benefitFromCodePlacementOpt = true;
+  }
 }
 
 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
@@ -517,6 +532,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::FADDRTZ:         return "PPCISD::FADDRTZ";
   case PPCISD::MTFSF:           return "PPCISD::MTFSF";
   case PPCISD::TC_RETURN:       return "PPCISD::TC_RETURN";
+  case PPCISD::CR6SET:          return "PPCISD::CR6SET";
+  case PPCISD::CR6UNSET:        return "PPCISD::CR6UNSET";
   }
 }
 
@@ -811,14 +828,13 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
   }
 
   // Properly sign extend the value.
-  int ShAmt = (4-ByteSize)*8;
-  int MaskVal = ((int)Value << ShAmt) >> ShAmt;
+  int MaskVal = SignExtend32(Value, ByteSize * 8);
 
   // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
   if (MaskVal == 0) return SDValue();
 
   // Finally, if this value fits in a 5 bit sext field, return it
-  if (((MaskVal << (32-5)) >> (32-5)) == MaskVal)
+  if (SignExtend32<5>(MaskVal) == MaskVal)
     return DAG.getTargetConstant(MaskVal, MVT::i32);
   return SDValue();
 }
@@ -1204,6 +1220,14 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   const Constant *C = CP->getConstVal();
 
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+    return DAG.getNode(PPCISD::TOC_ENTRY, CP->getDebugLoc(), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
   unsigned MOHiFlag, MOLoFlag;
   bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
   SDValue CPIHi =
@@ -1217,6 +1241,14 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   EVT PtrVT = Op.getValueType();
   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
 
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+    return DAG.getNode(PPCISD::TOC_ENTRY, JT->getDebugLoc(), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
   unsigned MOHiFlag, MOLoFlag;
   bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
   SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
@@ -1441,7 +1473,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
                               MachinePointerInfo(),
                               MVT::i32, false, false, 0);
 
-  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(), 
+  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
                      false, false, false, 0);
 }
 
@@ -2408,7 +2440,7 @@ static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
 
   int Addr = C->getZExtValue();
   if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
-      (Addr << 6 >> 6) != Addr)
+      SignExtend32<26>(Addr) != Addr)
     return 0;  // Top 6 bits have to be sext of immediate.
 
   return DAG.getConstant((int)C->getZExtValue() >> 2,
@@ -2819,6 +2851,10 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
                                  isTailCall, RegsToPass, Ops, NodeTys,
                                  PPCSubTarget);
 
+  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
+  if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
+    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
+
   // When performing tail call optimization the callee pops its arguments off
   // the stack. Account for this here so these bytes can be pushed back on in
   // PPCRegisterInfo::eliminateCallFramePseudoInstr.
@@ -3116,14 +3152,6 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                         &MemOpChains[0], MemOpChains.size());
 
-  // Set CR6 to true if this is a vararg call with floating args passed in
-  // registers.
-  if (isVarArg) {
-    SDValue SetCR(DAG.getMachineNode(seenFloatArg ? PPC::CRSET : PPC::CRUNSET,
-                                     dl, MVT::i32), 0);
-    RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR));
-  }
-
   // Build a sequence of copy-to-reg nodes chained together with token chain
   // and flag operands which copy the outgoing args into the appropriate regs.
   SDValue InFlag;
@@ -3133,6 +3161,18 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
     InFlag = Chain.getValue(1);
   }
 
+  // Set CR bit 6 to true if this is a vararg call with floating args passed in
+  // registers.
+  if (isVarArg) {
+    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, InFlag };
+
+    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
+                        dl, VTs, Ops, InFlag.getNode() ? 2 : 1);
+
+    InFlag = Chain.getValue(1);
+  }
+
   if (isTailCall)
     PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
                     false, TailCallArguments);
@@ -4126,7 +4166,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     unsigned TypeShiftAmt = i & (SplatBitSize-1);
 
     // vsplti + shl self.
-    if (SextVal == (i << (int)TypeShiftAmt)) {
+    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
       SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
       static const unsigned IIDs[] = { // Intrinsic to use for each size.
         Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
@@ -4171,17 +4211,17 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     }
 
     // t = vsplti c, result = vsldoi t, t, 1
-    if (SextVal == ((i << 8) | (i < 0 ? 0xFF : 0))) {
+    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
       SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
       return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
     }
     // t = vsplti c, result = vsldoi t, t, 2
-    if (SextVal == ((i << 16) | (i < 0 ? 0xFFFF : 0))) {
+    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
       SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
       return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
     }
     // t = vsplti c, result = vsldoi t, t, 3
-    if (SextVal == ((i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
+    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
       SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
       return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
     }
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index b0a013b..902b188 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -174,6 +174,10 @@ namespace llvm {
       ///   operand #3 optional in flag
       TC_RETURN,
 
+      /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
+      CR6SET,
+      CR6UNSET,
+
       /// STD_32 - This is the STD instruction for use with "32-bit" registers.
       STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE,
 
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 39778a5..cfe71d17 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -29,6 +29,9 @@ def symbolLo64 : Operand<i64> {
   let PrintMethod = "printSymbolLo";
   let EncoderMethod = "getLO16Encoding";
 }
+def tocentry : Operand<iPTR> {
+  let MIOperandInfo = (ops i32imm:$imm);
+}
 
 //===----------------------------------------------------------------------===//
 // 64-bit transformation functions.
@@ -296,12 +299,14 @@ def MFLR8  : XFXForm_1_ext<31, 339, 8, (outs G8RC:$rT), (ins),
 
 let PPC970_Unit = 1 in {  // FXU Operations.
 
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
 def LI8  : DForm_2_r0<14, (outs G8RC:$rD), (ins symbolLo64:$imm),
                       "li $rD, $imm", IntSimple,
                       [(set G8RC:$rD, immSExt16:$imm)]>;
 def LIS8 : DForm_2_r0<15, (outs G8RC:$rD), (ins symbolHi64:$imm),
                       "lis $rD, $imm", IntSimple,
                       [(set G8RC:$rD, imm16ShiftedSExt:$imm)]>;
+}
 
 // Logical ops.
 def NAND8: XForm_6<31, 476, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
@@ -459,7 +464,7 @@ def EXTSW_32_64 : XForm_11<31, 986, (outs G8RC:$rA), (ins GPRC:$rS),
 
 let Defs = [CARRY] in {
 def SRADI  : XSForm_1<31, 413, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH),
-                      "sradi $rA, $rS, $SH", IntRotateD,
+                      "sradi $rA, $rS, $SH", IntRotateDI,
                       [(set G8RC:$rA, (sra G8RC:$rS, (i32 imm:$SH)))]>, isPPC64;
 }
 def CNTLZD : XForm_11<31, 58, (outs G8RC:$rA), (ins G8RC:$rS),
@@ -482,7 +487,7 @@ def MULLD : XOForm_1<31, 233, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
 let isCommutable = 1 in {
 def RLDIMI : MDForm_1<30, 3,
                       (outs G8RC:$rA), (ins G8RC:$rSi, G8RC:$rS, u6imm:$SH, u6imm:$MB),
-                      "rldimi $rA, $rS, $SH, $MB", IntRotateD,
+                      "rldimi $rA, $rS, $SH, $MB", IntRotateDI,
                       []>, isPPC64, RegConstraint<"$rSi = $rA">,
                       NoEncode<"$rSi">;
 }
@@ -494,11 +499,11 @@ def RLDCL  : MDForm_1<30, 0,
                       []>, isPPC64;
 def RLDICL : MDForm_1<30, 0,
                       (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MB),
-                      "rldicl $rA, $rS, $SH, $MB", IntRotateD,
+                      "rldicl $rA, $rS, $SH, $MB", IntRotateDI,
                       []>, isPPC64;
 def RLDICR : MDForm_1<30, 1,
                       (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$ME),
-                      "rldicr $rA, $rS, $SH, $ME", IntRotateD,
+                      "rldicr $rA, $rS, $SH, $ME", IntRotateDI,
                       []>, isPPC64;
 
 def RLWINM8 : MForm_2<21,
@@ -541,19 +546,19 @@ def LWAX : XForm_1<31, 341, (outs G8RC:$rD), (ins memrr:$src),
 let mayLoad = 1 in
 def LHAU8 : DForm_1a<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp,
                             ptr_rc:$rA),
-                    "lhau $rD, $disp($rA)", LdStLoad,
+                    "lhau $rD, $disp($rA)", LdStLHAU,
                     []>, RegConstraint<"$rA = $ea_result">,
                     NoEncode<"$ea_result">;
 // NO LWAU!
 
 def LHAUX8 : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result),
                     (ins memrr:$addr),
-                    "lhaux $rD, $addr", LdStLoad,
+                    "lhaux $rD, $addr", LdStLHAU,
                     []>, RegConstraint<"$addr.offreg = $ea_result">,
                     NoEncode<"$ea_result">;
 def LWAUX : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result),
                     (ins memrr:$addr),
-                    "lwaux $rD, $addr", LdStLoad,
+                    "lwaux $rD, $addr", LdStLHAU,
                     []>, RegConstraint<"$addr.offreg = $ea_result">,
                     NoEncode<"$ea_result">, isPPC64;
 }
@@ -584,31 +589,31 @@ def LWZX8 : XForm_1<31,  23, (outs G8RC:$rD), (ins memrr:$src),
 // Update forms.
 let mayLoad = 1 in {
 def LBZU8 : DForm_1<35, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                    "lbzu $rD, $addr", LdStLoad,
+                    "lbzu $rD, $addr", LdStLoadUpd,
                     []>, RegConstraint<"$addr.reg = $ea_result">,
                     NoEncode<"$ea_result">;
 def LHZU8 : DForm_1<41, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                    "lhzu $rD, $addr", LdStLoad,
+                    "lhzu $rD, $addr", LdStLoadUpd,
                     []>, RegConstraint<"$addr.reg = $ea_result">,
                     NoEncode<"$ea_result">;
 def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                    "lwzu $rD, $addr", LdStLoad,
+                    "lwzu $rD, $addr", LdStLoadUpd,
                     []>, RegConstraint<"$addr.reg = $ea_result">,
                     NoEncode<"$ea_result">;
 
 def LBZUX8 : XForm_1<31, 119, (outs G8RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lbzux $rD, $addr", LdStLoad,
+                   "lbzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 def LHZUX8 : XForm_1<31, 331, (outs G8RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lhzux $rD, $addr", LdStLoad,
+                   "lhzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 def LWZUX8 : XForm_1<31, 55, (outs G8RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lwzux $rD, $addr", LdStLoad,
+                   "lwzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 }
@@ -624,6 +629,14 @@ def LDtoc: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
                   "",
                   [(set G8RC:$rD,
                      (PPCtoc_entry tglobaladdr:$disp, G8RC:$reg))]>, isPPC64;
+def LDtocJTI: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+                  "",
+                  [(set G8RC:$rD,
+                     (PPCtoc_entry tjumptable:$disp, G8RC:$reg))]>, isPPC64;
+def LDtocCPT: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+                  "",
+                  [(set G8RC:$rD,
+                     (PPCtoc_entry tconstpool:$disp, G8RC:$reg))]>, isPPC64;
 
 let hasSideEffects = 1 in { 
 let RST = 2, DS_RA = 0 in // FIXME: Should be a pseudo.
@@ -642,13 +655,13 @@ def LDX  : XForm_1<31,  21, (outs G8RC:$rD), (ins memrr:$src),
                    
 let mayLoad = 1 in
 def LDU  : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrix:$addr),
-                    "ldu $rD, $addr", LdStLD,
+                    "ldu $rD, $addr", LdStLDU,
                     []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
                     NoEncode<"$ea_result">;
 
 def LDUX : XForm_1<31, 53, (outs G8RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "ldux $rD, $addr", LdStLoad,
+                   "ldux $rD, $addr", LdStLDU,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">, isPPC64;
 }
@@ -695,14 +708,14 @@ let PPC970_Unit = 2 in {
 
 def STBU8 : DForm_1a<38, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stbu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stbu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                           (pre_truncsti8 G8RC:$rS, ptr_rc:$ptrreg, 
                                          iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
 def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "sthu $rS, $ptroff($ptrreg)", LdStStore,
+                    "sthu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                         (pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg, 
                                         iaddroff:$ptroff))]>,
@@ -710,7 +723,7 @@ def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
 
 def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stwu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stwu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                           (pre_truncsti32 G8RC:$rS, ptr_rc:$ptrreg,
                                           iaddroff:$ptroff))]>,
@@ -718,7 +731,7 @@ def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
 
 def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
                                         s16immX4:$ptroff, ptr_rc:$ptrreg),
-                    "stdu $rS, $ptroff($ptrreg)", LdStSTD,
+                    "stdu $rS, $ptroff($ptrreg)", LdStSTDU,
                     [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg, 
                                                      iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">,
@@ -727,7 +740,7 @@ def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
 
 def STBUX8 : XForm_8<31, 247, (outs ptr_rc:$ea_res),
                               (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "stbux $rS, $ptroff, $ptrreg", LdStStore,
+                    "stbux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                        (pre_truncsti8 G8RC:$rS,
                                       ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -736,7 +749,7 @@ def STBUX8 : XForm_8<31, 247, (outs ptr_rc:$ea_res),
 
 def STHUX8 : XForm_8<31, 439, (outs ptr_rc:$ea_res),
                               (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "sthux $rS, $ptroff, $ptrreg", LdStStore,
+                    "sthux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                        (pre_truncsti16 G8RC:$rS,
                                        ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -745,7 +758,7 @@ def STHUX8 : XForm_8<31, 439, (outs ptr_rc:$ea_res),
 
 def STWUX8 : XForm_8<31, 183, (outs ptr_rc:$ea_res),
                               (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "stwux $rS, $ptroff, $ptrreg", LdStStore,
+                    "stwux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                        (pre_truncsti32 G8RC:$rS,
                                        ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -754,7 +767,7 @@ def STWUX8 : XForm_8<31, 183, (outs ptr_rc:$ea_res),
 
 def STDUX : XForm_8<31, 181, (outs ptr_rc:$ea_res),
                               (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "stdux $rS, $ptroff, $ptrreg", LdStStore,
+                    "stdux $rS, $ptroff, $ptrreg", LdStSTDU,
                     [(set ptr_rc:$ea_res,
                        (pre_store G8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
                     RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 47f09dc..d2df664 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -54,7 +54,8 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer(
   const TargetMachine *TM,
   const ScheduleDAG *DAG) const {
   unsigned Directive = TM->getSubtarget<PPCSubtarget>().getDarwinDirective();
-  if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2) {
+  if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
+      Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
     const InstrItineraryData *II = TM->getInstrItineraryData();
     return new PPCScoreboardHazardRecognizer(II, DAG);
   }
@@ -70,7 +71,8 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
   unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
 
   // Most subtargets use a PPC970 recognizer.
-  if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2) {
+  if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
+      Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
     const TargetInstrInfo *TII = TM.getInstrInfo();
     assert(TII && "No InstrInfo?");
 
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index f57f0c9..a503908 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -123,9 +123,11 @@ def PPCnop : SDNode<"PPCISD::NOP", SDT_PPCnop, [SDNPInGlue, SDNPOutGlue]>;
 def PPCload   : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>,
                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>,
-                          [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+                          [SDNPHasChain, SDNPSideEffect,
+                           SDNPInGlue, SDNPOutGlue]>;
 def PPCtoc_restore : SDNode<"PPCISD::TOC_RESTORE", SDTypeProfile<0, 0, []>,
-                            [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+                            [SDNPHasChain, SDNPSideEffect,
+                             SDNPInGlue, SDNPOutGlue]>;
 def PPCmtctr      : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 def PPCbctrl_Darwin  : SDNode<"PPCISD::BCTRL_Darwin", SDTNone,
@@ -153,6 +155,12 @@ def PPClbrx       : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
 def PPCstbrx      : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
                            [SDNPHasChain, SDNPMayStore]>;
 
+// Instructions to set/unset CR bit 6 for SVR4 vararg calls
+def PPCcr6set   : SDNode<"PPCISD::CR6SET", SDTNone,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
 // Instructions to support atomic operations
 def PPClarx      : SDNode<"PPCISD::LARX", SDT_PPClarx,
                           [SDNPHasChain, SDNPMayLoad]>;
@@ -330,9 +338,6 @@ def memrix : Operand<iPTR> {   // memri where the imm is shifted 2 bits.
   let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg);
   let EncoderMethod = "getMemRIXEncoding";
 }
-def tocentry : Operand<iPTR> {
-  let MIOperandInfo = (ops i32imm:$imm);
-}
 
 // PowerPC Predicate operand.  20 = (0<<5)|20 = always, CR0 is a dummy reg
 // that doesn't matter.
@@ -673,7 +678,7 @@ def LWZ : DForm_1<32, (outs GPRC:$rD), (ins memri:$src),
                   [(set GPRC:$rD, (load iaddr:$src))]>;
 
 def LFS : DForm_1<48, (outs F4RC:$rD), (ins memri:$src),
-                  "lfs $rD, $src", LdStLFDU,
+                  "lfs $rD, $src", LdStLFD,
                   [(set F4RC:$rD, (load iaddr:$src))]>;
 def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src),
                   "lfd $rD, $src", LdStLFD,
@@ -683,32 +688,32 @@ def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src),
 // Unindexed (r+i) Loads with Update (preinc).
 let mayLoad = 1 in {
 def LBZU : DForm_1<35, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                   "lbzu $rD, $addr", LdStLoad,
+                   "lbzu $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LHAU : DForm_1<43, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                   "lhau $rD, $addr", LdStLoad,
+                   "lhau $rD, $addr", LdStLHAU,
                    []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LHZU : DForm_1<41, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                   "lhzu $rD, $addr", LdStLoad,
+                   "lhzu $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LWZU : DForm_1<33, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                   "lwzu $rD, $addr", LdStLoad,
+                   "lwzu $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LFSU : DForm_1<49, (outs F4RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                  "lfs $rD, $addr", LdStLFDU,
+                  "lfsu $rD, $addr", LdStLFDU,
                   []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                  "lfd $rD, $addr", LdStLFD,
+                  "lfdu $rD, $addr", LdStLFDU,
                   []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
@@ -716,37 +721,37 @@ def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
 // Indexed (r+r) Loads with Update (preinc).
 def LBZUX : XForm_1<31, 119, (outs GPRC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lbzux $rD, $addr", LdStLoad,
+                   "lbzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LHAUX : XForm_1<31, 375, (outs GPRC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lhaux $rD, $addr", LdStLoad,
+                   "lhaux $rD, $addr", LdStLHAU,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LHZUX : XForm_1<31, 331, (outs GPRC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lhzux $rD, $addr", LdStLoad,
+                   "lhzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LWZUX : XForm_1<31, 55, (outs GPRC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lwzux $rD, $addr", LdStLoad,
+                   "lwzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LFSUX : XForm_1<31, 567, (outs F4RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lfsux $rD, $addr", LdStLoad,
+                   "lfsux $rD, $addr", LdStLFDU,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LFDUX : XForm_1<31, 631, (outs F8RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lfdux $rD, $addr", LdStLoad,
+                   "lfdux $rD, $addr", LdStLFDU,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 }
@@ -778,10 +783,10 @@ def LWBRX : XForm_1<31,  534, (outs GPRC:$rD), (ins memrr:$src),
                    [(set GPRC:$rD, (PPClbrx xoaddr:$src, i32))]>;
 
 def LFSX   : XForm_25<31, 535, (outs F4RC:$frD), (ins memrr:$src),
-                      "lfsx $frD, $src", LdStLFDU,
+                      "lfsx $frD, $src", LdStLFD,
                       [(set F4RC:$frD, (load xaddr:$src))]>;
 def LFDX   : XForm_25<31, 599, (outs F8RC:$frD), (ins memrr:$src),
-                      "lfdx $frD, $src", LdStLFDU,
+                      "lfdx $frD, $src", LdStLFD,
                       [(set F8RC:$frD, (load xaddr:$src))]>;
 }
 
@@ -801,10 +806,10 @@ def STW  : DForm_1<36, (outs), (ins GPRC:$rS, memri:$src),
                    "stw $rS, $src", LdStStore,
                    [(store GPRC:$rS, iaddr:$src)]>;
 def STFS : DForm_1<52, (outs), (ins F4RC:$rS, memri:$dst),
-                   "stfs $rS, $dst", LdStUX,
+                   "stfs $rS, $dst", LdStSTFD,
                    [(store F4RC:$rS, iaddr:$dst)]>;
 def STFD : DForm_1<54, (outs), (ins F8RC:$rS, memri:$dst),
-                   "stfd $rS, $dst", LdStUX,
+                   "stfd $rS, $dst", LdStSTFD,
                    [(store F8RC:$rS, iaddr:$dst)]>;
 }
 
@@ -812,33 +817,33 @@ def STFD : DForm_1<54, (outs), (ins F8RC:$rS, memri:$dst),
 let PPC970_Unit = 2 in {
 def STBU  : DForm_1a<39, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stbu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stbu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                           (pre_truncsti8 GPRC:$rS, ptr_rc:$ptrreg, 
                                          iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
 def STHU  : DForm_1a<45, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "sthu $rS, $ptroff($ptrreg)", LdStStore,
+                    "sthu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                         (pre_truncsti16 GPRC:$rS, ptr_rc:$ptrreg, 
                                         iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
 def STWU  : DForm_1a<37, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stwu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stwu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res, (pre_store GPRC:$rS, ptr_rc:$ptrreg, 
                                                      iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
 def STFSU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F4RC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stfsu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stfsu $rS, $ptroff($ptrreg)", LdStSTFDU,
                     [(set ptr_rc:$ea_res, (pre_store F4RC:$rS,  ptr_rc:$ptrreg, 
                                           iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
 def STFDU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F8RC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stfdu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stfdu $rS, $ptroff($ptrreg)", LdStSTFDU,
                     [(set ptr_rc:$ea_res, (pre_store F8RC:$rS, ptr_rc:$ptrreg, 
                                           iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
@@ -863,7 +868,7 @@ def STWX  : XForm_8<31, 151, (outs), (ins GPRC:$rS, memrr:$dst),
  
 def STBUX : XForm_8<31, 247, (outs ptr_rc:$ea_res),
                              (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                   "stbux $rS, $ptroff, $ptrreg", LdStStore,
+                   "stbux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                    [(set ptr_rc:$ea_res,
                       (pre_truncsti8 GPRC:$rS,
                                      ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -872,7 +877,7 @@ def STBUX : XForm_8<31, 247, (outs ptr_rc:$ea_res),
  
 def STHUX : XForm_8<31, 439, (outs ptr_rc:$ea_res),
                              (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                   "sthux $rS, $ptroff, $ptrreg", LdStStore,
+                   "sthux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                    [(set ptr_rc:$ea_res,
                       (pre_truncsti16 GPRC:$rS,
                                       ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -881,7 +886,7 @@ def STHUX : XForm_8<31, 439, (outs ptr_rc:$ea_res),
                  
 def STWUX : XForm_8<31, 183, (outs ptr_rc:$ea_res),
                              (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                   "stwux $rS, $ptroff, $ptrreg", LdStStore,
+                   "stwux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                    [(set ptr_rc:$ea_res,
                       (pre_store GPRC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
                    RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -889,7 +894,7 @@ def STWUX : XForm_8<31, 183, (outs ptr_rc:$ea_res),
 
 def STFSUX : XForm_8<31, 695, (outs ptr_rc:$ea_res),
                               (ins F4RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "stfsux $rS, $ptroff, $ptrreg", LdStStore,
+                    "stfsux $rS, $ptroff, $ptrreg", LdStSTFDU,
                     [(set ptr_rc:$ea_res,
                        (pre_store F4RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
                     RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -897,7 +902,7 @@ def STFSUX : XForm_8<31, 695, (outs ptr_rc:$ea_res),
 
 def STFDUX : XForm_8<31, 759, (outs ptr_rc:$ea_res),
                               (ins F8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "stfdux $rS, $ptroff, $ptrreg", LdStStore,
+                    "stfdux $rS, $ptroff, $ptrreg", LdStSTFDU,
                     [(set ptr_rc:$ea_res,
                        (pre_store F8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
                     RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -913,14 +918,14 @@ def STWBRX: XForm_8<31, 662, (outs), (ins GPRC:$rS, memrr:$dst),
                    PPC970_DGroup_Cracked;
 
 def STFIWX: XForm_28<31, 983, (outs), (ins F8RC:$frS, memrr:$dst),
-                     "stfiwx $frS, $dst", LdStUX,
+                     "stfiwx $frS, $dst", LdStSTFD,
                      [(PPCstfiwx F8RC:$frS, xoaddr:$dst)]>;
                      
 def STFSX : XForm_28<31, 663, (outs), (ins F4RC:$frS, memrr:$dst),
-                     "stfsx $frS, $dst", LdStUX,
+                     "stfsx $frS, $dst", LdStSTFD,
                      [(store F4RC:$frS, xaddr:$dst)]>;
 def STFDX : XForm_28<31, 727, (outs), (ins F8RC:$frS, memrr:$dst),
-                     "stfdx $frS, $dst", LdStUX,
+                     "stfdx $frS, $dst", LdStSTFD,
                      [(store F8RC:$frS, xaddr:$dst)]>;
 }
 
@@ -964,7 +969,7 @@ def SUBFIC : DForm_2< 8, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
                      [(set GPRC:$rD, (subc immSExt16:$imm, GPRC:$rA))]>;
 }
 
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
   def LI  : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm),
                        "li $rD, $imm", IntSimple,
                        [(set GPRC:$rD, immSExt16:$imm)]>;
@@ -1143,6 +1148,16 @@ def CRUNSET: XLForm_1_ext<19, 193, (outs CRBITRC:$dst), (ins),
               "crxor $dst, $dst, $dst", BrCR,
               []>;
 
+let Defs = [CR1EQ], CRD = 6 in {
+def CR6SET  : XLForm_1_ext<19, 289, (outs), (ins),
+              "creqv 6, 6, 6", BrCR,
+              [(PPCcr6set)]>;
+
+def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
+              "crxor 6, 6, 6", BrCR,
+              [(PPCcr6unset)]>;
+}
+
 // XFX-Form instructions.  Instructions that deal with SPRs.
 //
 let Uses = [CTR] in {
@@ -1233,7 +1248,7 @@ let Uses = [RM] in {
                PPC970_DGroup_Single, PPC970_Unit_FPU;
   def FADDrtz: AForm_2<63, 21,
                       (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
-                      "fadd $FRT, $FRA, $FRB", FPGeneral,
+                      "fadd $FRT, $FRA, $FRB", FPAddSub,
                       [(set F8RC:$FRT, (PPCfaddrtz F8RC:$FRA, F8RC:$FRB))]>,
                PPC970_DGroup_Single, PPC970_Unit_FPU;
 }
@@ -1364,7 +1379,7 @@ def FSELS : AForm_1<63, 23,
 let Uses = [RM] in {
   def FADD  : AForm_2<63, 21,
                       (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
-                      "fadd $FRT, $FRA, $FRB", FPGeneral,
+                      "fadd $FRT, $FRA, $FRB", FPAddSub,
                       [(set F8RC:$FRT, (fadd F8RC:$FRA, F8RC:$FRB))]>;
   def FADDS : AForm_2<59, 21,
                       (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
@@ -1388,7 +1403,7 @@ let Uses = [RM] in {
                       [(set F4RC:$FRT, (fmul F4RC:$FRA, F4RC:$FRB))]>;
   def FSUB  : AForm_2<63, 20,
                       (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
-                      "fsub $FRT, $FRA, $FRB", FPGeneral,
+                      "fsub $FRT, $FRA, $FRB", FPAddSub,
                       [(set F8RC:$FRT, (fsub F8RC:$FRA, F8RC:$FRB))]>;
   def FSUBS : AForm_2<59, 20,
                       (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index 6a6ccb9..660c0c3 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -40,6 +40,7 @@ def IntMulHWU    : InstrItinClass;
 def IntMulLI     : InstrItinClass;
 def IntRFID      : InstrItinClass;
 def IntRotateD   : InstrItinClass;
+def IntRotateDI  : InstrItinClass;
 def IntRotate    : InstrItinClass;
 def IntShift     : InstrItinClass;
 def IntTrapD     : InstrItinClass;
@@ -52,15 +53,18 @@ def LdStDCBA     : InstrItinClass;
 def LdStDCBF     : InstrItinClass;
 def LdStDCBI     : InstrItinClass;
 def LdStLoad     : InstrItinClass;
+def LdStLoadUpd  : InstrItinClass;
 def LdStStore    : InstrItinClass;
+def LdStStoreUpd : InstrItinClass;
 def LdStDSS      : InstrItinClass;
 def LdStICBI     : InstrItinClass;
-def LdStUX       : InstrItinClass;
 def LdStLD       : InstrItinClass;
+def LdStLDU      : InstrItinClass;
 def LdStLDARX    : InstrItinClass;
 def LdStLFD      : InstrItinClass;
 def LdStLFDU     : InstrItinClass;
 def LdStLHA      : InstrItinClass;
+def LdStLHAU     : InstrItinClass;
 def LdStLMW      : InstrItinClass;
 def LdStLVecX    : InstrItinClass;
 def LdStLWA      : InstrItinClass;
@@ -69,6 +73,9 @@ def LdStSLBIA    : InstrItinClass;
 def LdStSLBIE    : InstrItinClass;
 def LdStSTD      : InstrItinClass;
 def LdStSTDCX    : InstrItinClass;
+def LdStSTDU     : InstrItinClass;
+def LdStSTFD     : InstrItinClass;
+def LdStSTFDU    : InstrItinClass;
 def LdStSTVEBX   : InstrItinClass;
 def LdStSTWCX    : InstrItinClass;
 def LdStSync     : InstrItinClass;
@@ -86,6 +93,7 @@ def SprMTSRIN    : InstrItinClass;
 def SprRFI       : InstrItinClass;
 def SprSC        : InstrItinClass;
 def FPGeneral    : InstrItinClass;
+def FPAddSub     : InstrItinClass;
 def FPCompare    : InstrItinClass;
 def FPDivD       : InstrItinClass;
 def FPDivS       : InstrItinClass;
@@ -110,6 +118,8 @@ include "PPCScheduleG4.td"
 include "PPCScheduleG4Plus.td"
 include "PPCScheduleG5.td"
 include "PPCScheduleA2.td"
+include "PPCScheduleE500mc.td"
+include "PPCScheduleE5500.td"
 
 //===----------------------------------------------------------------------===//
 // Instruction to itinerary class map - When add new opcodes to the supported
@@ -171,7 +181,7 @@ include "PPCScheduleA2.td"
 //    extsh      IntSimple
 //    extsw      IntSimple
 //    fabs       FPGeneral
-//    fadd       FPGeneral
+//    fadd       FPAddSub
 //    fadds      FPGeneral
 //    fcfid      FPGeneral
 //    fcmpo      FPCompare
@@ -201,35 +211,35 @@ include "PPCScheduleA2.td"
 //    fsel       FPGeneral
 //    fsqrt      FPSqrt
 //    fsqrts     FPSqrt
-//    fsub       FPGeneral
+//    fsub       FPAddSub
 //    fsubs      FPGeneral
 //    icbi       LdStICBI
 //    isync      SprISYNC
 //    lbz        LdStLoad
-//    lbzu       LdStLoad
-//    lbzux      LdStUX
+//    lbzu       LdStLoadUpd
+//    lbzux      LdStLoadUpd
 //    lbzx       LdStLoad
 //    ld         LdStLD
 //    ldarx      LdStLDARX
-//    ldu        LdStLD
-//    ldux       LdStLD
+//    ldu        LdStLDU
+//    ldux       LdStLDU
 //    ldx        LdStLD
 //    lfd        LdStLFD
 //    lfdu       LdStLFDU
 //    lfdux      LdStLFDU
-//    lfdx       LdStLFDU
-//    lfs        LdStLFDU
+//    lfdx       LdStLFD
+//    lfs        LdStLFD
 //    lfsu       LdStLFDU
 //    lfsux      LdStLFDU
-//    lfsx       LdStLFDU
+//    lfsx       LdStLFD
 //    lha        LdStLHA
-//    lhau       LdStLHA
-//    lhaux      LdStLHA
+//    lhau       LdStLHAU
+//    lhaux      LdStLHAU
 //    lhax       LdStLHA
 //    lhbrx      LdStLoad
 //    lhz        LdStLoad
-//    lhzu       LdStLoad
-//    lhzux      LdStUX
+//    lhzu       LdStLoadUpd
+//    lhzux      LdStLoadUpd
 //    lhzx       LdStLoad
 //    lmw        LdStLMW
 //    lswi       LdStLMW
@@ -243,12 +253,12 @@ include "PPCScheduleA2.td"
 //    lvxl       LdStLVecX
 //    lwa        LdStLWA
 //    lwarx      LdStLWARX
-//    lwaux      LdStLHA
+//    lwaux      LdStLHAU
 //    lwax       LdStLHA
 //    lwbrx      LdStLoad
 //    lwz        LdStLoad
-//    lwzu       LdStLoad
-//    lwzux      LdStUX
+//    lwzu       LdStLoadUpd
+//    lwzux      LdStLoadUpd
 //    lwzx       LdStLoad
 //    mcrf       BrMCR
 //    mcrfs      FPGeneral
@@ -292,10 +302,10 @@ include "PPCScheduleA2.td"
 //    rfid       IntRFID
 //    rldcl      IntRotateD
 //    rldcr      IntRotateD
-//    rldic      IntRotateD
-//    rldicl     IntRotateD
-//    rldicr     IntRotateD
-//    rldimi     IntRotateD
+//    rldic      IntRotateDI
+//    rldicl     IntRotateDI
+//    rldicr     IntRotateDI
+//    rldimi     IntRotateDI
 //    rlwimi     IntRotate
 //    rlwinm     IntGeneral
 //    rlwnm      IntGeneral
@@ -305,33 +315,33 @@ include "PPCScheduleA2.td"
 //    sld        IntRotateD
 //    slw        IntGeneral
 //    srad       IntRotateD
-//    sradi      IntRotateD
+//    sradi      IntRotateDI
 //    sraw       IntShift
 //    srawi      IntShift
 //    srd        IntRotateD
 //    srw        IntGeneral
 //    stb        LdStStore
-//    stbu       LdStStore
-//    stbux      LdStStore
+//    stbu       LdStStoreUpd
+//    stbux      LdStStoreUpd
 //    stbx       LdStStore
 //    std        LdStSTD
 //    stdcx.     LdStSTDCX
-//    stdu       LdStSTD
-//    stdux      LdStSTD
+//    stdu       LdStSTDU
+//    stdux      LdStSTDU
 //    stdx       LdStSTD
-//    stfd       LdStUX
-//    stfdu      LdStUX
-//    stfdux     LdStUX
-//    stfdx      LdStUX
-//    stfiwx     LdStUX
-//    stfs       LdStUX
-//    stfsu      LdStUX
-//    stfsux     LdStUX
-//    stfsx      LdStUX
+//    stfd       LdStSTFD
+//    stfdu      LdStSTFDU
+//    stfdux     LdStSTFDU
+//    stfdx      LdStSTFD
+//    stfiwx     LdStSTFD
+//    stfs       LdStSTFD
+//    stfsu      LdStSTFDU
+//    stfsux     LdStSTFDU
+//    stfsx      LdStSTFD
 //    sth        LdStStore
 //    sthbrx     LdStStore
-//    sthu       LdStStore
-//    sthux      LdStStore
+//    sthu       LdStStoreUpd
+//    sthux      LdStStoreUpd
 //    sthx       LdStStore
 //    stmw       LdStLMW
 //    stswi      LdStLMW
@@ -344,8 +354,8 @@ include "PPCScheduleA2.td"
 //    stw        LdStStore
 //    stwbrx     LdStStore
 //    stwcx.     LdStSTWCX
-//    stwu       LdStStore
-//    stwux      LdStStore
+//    stwu       LdStStoreUpd
+//    stwux      LdStStoreUpd
 //    stwx       LdStStore
 //    subf       IntGeneral
 //    subfc      IntGeneral
diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td
index cd0fb70..37b6eac 100644
--- a/lib/Target/PowerPC/PPCSchedule440.td
+++ b/lib/Target/PowerPC/PPCSchedule440.td
@@ -288,6 +288,15 @@ def PPC440Itineraries : ProcessorItineraries<
                                InstrStage<2, [LWB]>],
                               [9, 5],
                               [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [9, 5],
+                              [GPR_Bypass, GPR_Bypass]>,                              
   InstrItinData<LdStStore   , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,
@@ -297,6 +306,15 @@ def PPC440Itineraries : ProcessorItineraries<
                                InstrStage<2, [LWB]>],
                               [8, 5],
                               [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,                              
   InstrItinData<LdStICBI    , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,
@@ -306,7 +324,7 @@ def PPC440Itineraries : ProcessorItineraries<
                                InstrStage<1, [LWB]>],
                               [8, 5],
                               [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStUX      , [InstrStage<1, [IFTH1, IFTH2]>,
+  InstrItinData<LdStSTFD    , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,
                                InstrStage<1, [LRACC]>,
@@ -315,6 +333,15 @@ def PPC440Itineraries : ProcessorItineraries<
                                InstrStage<1, [LWB]>],
                               [8, 5, 5],
                               [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5, 5],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,                              
   InstrItinData<LdStLFD     , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,
@@ -342,6 +369,15 @@ def PPC440Itineraries : ProcessorItineraries<
                                InstrStage<1, [LWB]>],
                               [8, 5],
                               [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLHAU    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,                              
   InstrItinData<LdStLMW     , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,
@@ -371,6 +407,15 @@ def PPC440Itineraries : ProcessorItineraries<
                                InstrStage<2, [LWB]>],
                               [8, 5],
                               [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTDU    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,                              
   InstrItinData<LdStSTDCX   , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1]>,
@@ -537,6 +582,19 @@ def PPC440Itineraries : ProcessorItineraries<
                                InstrStage<1, [FWB]>],
                               [10, 4, 4],
                               [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [FRACC]>,
+                               InstrStage<1, [FEXE1]>,
+                               InstrStage<1, [FEXE2]>,
+                               InstrStage<1, [FEXE3]>,
+                               InstrStage<1, [FEXE4]>,
+                               InstrStage<1, [FEXE5]>,
+                               InstrStage<1, [FEXE6]>,
+                               InstrStage<1, [FWB]>],
+                              [10, 4, 4],
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
   InstrItinData<FPCompare   , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index 4d4a5d0..ba63b5c 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -181,6 +181,17 @@ def PPCA2Itineraries : ProcessorItineraries<
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [10, 7, 7],
                               [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotateDI , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,                              
   InstrItinData<IntShift    , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -302,7 +313,18 @@ def PPCA2Itineraries : ProcessorItineraries<
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [14, 7],
                               [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStLD      , [InstrStage<4,
+  InstrItinData<LdStLoadUpd , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7],
+                              [GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<LdStLDU     , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
                                               IU4_4, IU4_5, IU4_6, IU4_7]>,
@@ -324,6 +346,17 @@ def PPCA2Itineraries : ProcessorItineraries<
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [13, 7],
                               [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [13, 7],
+                              [GPR_Bypass, GPR_Bypass]>,
   InstrItinData<LdStICBI    , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -335,7 +368,7 @@ def PPCA2Itineraries : ProcessorItineraries<
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [14, 7],
                               [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStUX      , [InstrStage<4,
+  InstrItinData<LdStSTFD    , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
                                               IU4_4, IU4_5, IU4_6, IU4_7]>,
@@ -346,6 +379,17 @@ def PPCA2Itineraries : ProcessorItineraries<
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [14, 7, 7],
                               [NoBypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7, 7],
+                              [NoBypass, FPR_Bypass, FPR_Bypass]>,                              
   InstrItinData<LdStLFD     , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -379,6 +423,17 @@ def PPCA2Itineraries : ProcessorItineraries<
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [14, 7],
                               [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLHAU    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7],
+                              [NoBypass, GPR_Bypass]>,
   InstrItinData<LdStLMW     , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -412,6 +467,17 @@ def PPCA2Itineraries : ProcessorItineraries<
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [13, 7],
                               [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTDU    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [13, 7],
+                              [GPR_Bypass, GPR_Bypass]>,                              
   InstrItinData<LdStSTDCX   , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -593,6 +659,17 @@ def PPCA2Itineraries : ProcessorItineraries<
                                InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
                               [15, 7, 7],
                               [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPAddSub    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
+                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
+                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
+                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
+                              [15, 7, 7],
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
   InstrItinData<FPCompare   , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
new file mode 100644
index 0000000..9bb779a
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -0,0 +1,265 @@
+//===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e500mc 32-bit 
+// Power processor.
+// 
+// All information is derived from the "e500mc Core Reference Manual",
+// Freescale Document Number E500MCRM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e500mc core:
+//
+//  * Decode & Dispatch
+//    Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+//    queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+def DIS0 : FuncUnit; // Dispatch stage - insn 1
+def DIS1 : FuncUnit; // Dispatch stage - insn 2
+
+//  * Execute
+//    6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+//    Some instructions can only execute in SFX0 but not SFX1.
+//    The CFX has a bypass path, allowing non-divide instructions to execute 
+//    while a divide instruction is executed.
+def SFX0  : FuncUnit; // Simple unit 0
+def SFX1  : FuncUnit; // Simple unit 1
+def BU    : FuncUnit; // Branch unit
+def CFX_DivBypass 
+          : FuncUnit; // CFX divide bypass path
+def CFX_0 : FuncUnit; // CFX pipeline
+def LSU_0 : FuncUnit; // LSU pipeline
+def FPU_0 : FuncUnit; // FPU pipeline
+
+def PPCE500mcItineraries : ProcessorItineraries<
+  [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, LSU_0, FPU_0],
+  [CR_Bypass, GPR_Bypass, FPR_Bypass], [
+  InstrItinData<IntSimple   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1, 1], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntGeneral  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1, 1], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5, 1, 1], // Latency = 1 or 2
+                              [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntDivW     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<14, [CFX_DivBypass]>],
+                              [17, 1, 1], // Latency=4..35, Repeat= 4..35
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMFFS     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<8, [FPU_0]>],
+                              [11], // Latency = 8
+                              [FPR_Bypass]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<8, [FPU_0]>],
+                              [11, 1, 1], // Latency = 8
+                              [NoBypass, NoBypass, NoBypass]>,
+  InstrItinData<IntMulHW    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0]>],
+                              [7, 1, 1], // Latency = 4, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulHWU   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0]>],
+                              [7, 1, 1], // Latency = 4, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulLI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0]>],
+                              [7, 1, 1], // Latency = 4, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1, 1], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntShift    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1, 1], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntTrapW    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [SFX0]>],
+                              [5, 1], // Latency = 2, Repeat rate = 2
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<BrB         , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [4, 1], // Latency = 1
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<BrCR        , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [4, 1, 1], // Latency = 1
+                              [CR_Bypass, CR_Bypass, CR_Bypass]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [4, 1], // Latency = 1
+                              [CR_Bypass, CR_Bypass]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1, 1], // Latency = 1
+                              [CR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBA    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBF    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoad    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStStore   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [NoBypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStICBI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFD    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStLFD     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 1, 1], // Latency = 4
+                              [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLFDU    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 1, 1], // Latency = 4
+                              [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops
+  InstrItinData<LdStLHA     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLHAU    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<LdStLMW     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 1], // Latency = r+3
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLWARX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<3, [LSU_0]>],
+                              [6, 1, 1], // Latency = 3, Repeat rate = 3
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSync    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [SFX0]>],
+                              [7, 1],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [SFX0, SFX1]>],
+                              [5, 1], // Latency = 2, Repeat rate = 4
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMTSR     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0]>],
+                              [5, 1],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0], 0>]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<5, [SFX0]>],
+                              [8, 1],
+                              [GPR_Bypass, CR_Bypass]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [SFX0]>],
+                              [7, 1], // Latency = 4, Repeat rate = 4
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMFSPR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1], // Latency = 1, Repeat rate = 1
+                              [GPR_Bypass, CR_Bypass]>,
+  InstrItinData<SprMFTB     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [SFX0]>],
+                              [7, 1], // Latency = 4, Repeat rate = 4
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMTSPR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1], // Latency = 1, Repeat rate = 1
+                              [CR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0]>],
+                              [4, 1],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [FPU_0]>],
+                              [11, 1, 1], // Latency = 8, Repeat rate = 2 
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [FPU_0]>],
+                              [13, 1, 1], // Latency = 10, Repeat rate = 4 
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,                              
+  InstrItinData<FPCompare   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [FPU_0]>],
+                              [11, 1, 1], // Latency = 8, Repeat rate = 2
+                              [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivD      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<68, [FPU_0]>],
+                              [71, 1, 1], // Latency = 68, Repeat rate = 68
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivS      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<38, [FPU_0]>],
+                              [41, 1, 1], // Latency = 38, Repeat rate = 38
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPFused     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [FPU_0]>],
+                              [13, 1, 1, 1], // Latency = 10, Repeat rate = 4
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPRes       , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<38, [FPU_0]>],
+                              [41, 1], // Latency = 38, Repeat rate = 38
+                              [FPR_Bypass, FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e500mc machine model for scheduling and other instruction cost heuristics.
+
+def PPCE500mcModel : SchedMachineModel {
+  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 5; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let Itineraries = PPCE500mcItineraries;
+}
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
new file mode 100644
index 0000000..d7e11ac
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -0,0 +1,309 @@
+//===-- PPCScheduleE500mc.td - e5500 Scheduling Defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e5500 64-bit 
+// Power processor.
+// 
+// All information is derived from the "e5500 Core Reference Manual",
+// Freescale Document Number e5500RM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e5500 core
+// (These are the same as for the e500mc)
+//
+//  * Decode & Dispatch
+//    Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+//    queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+// def DIS0 : FuncUnit;
+// def DIS1 : FuncUnit;
+
+//  * Execute
+//    6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+//    The CFX has a bypass path, allowing non-divide instructions to execute 
+//    while a divide instruction is being executed.
+// def SFX0  : FuncUnit; // Simple unit 0
+// def SFX1  : FuncUnit; // Simple unit 1
+// def BU    : FuncUnit; // Branch unit
+// def CFX_DivBypass 
+//           : FuncUnit; // CFX divide bypass path
+// def CFX_0 : FuncUnit; // CFX pipeline stage 0
+
+def CFX_1 : FuncUnit; // CFX pipeline stage 1 
+
+// def LSU_0 : FuncUnit; // LSU pipeline
+// def FPU_0 : FuncUnit; // FPU pipeline
+
+
+def PPCE5500Itineraries : ProcessorItineraries<
+  [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, CFX_1,
+   LSU_0, FPU_0],
+  [CR_Bypass, GPR_Bypass, FPR_Bypass], [
+  InstrItinData<IntSimple   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5, 2, 2], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntGeneral  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5, 2, 2], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [6, 2, 2], // Latency = 1 or 2
+                              [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntDivD     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<26, [CFX_DivBypass]>],
+                              [30, 2, 2], // Latency= 4..26, Repeat rate= 4..26
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<IntDivW     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<16, [CFX_DivBypass]>],
+                              [20, 2, 2], // Latency= 4..16, Repeat rate= 4..16
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMFFS     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [FPU_0]>],
+                              [11], // Latency = 7, Repeat rate = 1
+                              [FPR_Bypass]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<7, [FPU_0]>],
+                              [11, 2, 2], // Latency = 7, Repeat rate = 7
+                              [NoBypass, NoBypass, NoBypass]>,
+  InstrItinData<IntMulHD    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<2, [CFX_1]>],
+                              [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<IntMulHW    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<1, [CFX_1]>],
+                              [8, 2, 2], // Latency = 4, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulHWU   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<1, [CFX_1]>],
+                              [8, 2, 2], // Latency = 4, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulLI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<2, [CFX_1]>],
+                              [8, 2, 2], // Latency = 4 or 5, Repeat = 2
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5, 2, 2], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotateD  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [SFX0, SFX1]>],
+                              [6, 2, 2], // Latency = 2, Repeat rate = 2
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotateDI , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5, 2, 2], // Latency = 1, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,                                                            
+  InstrItinData<IntShift    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [SFX0, SFX1]>],
+                              [6, 2, 2], // Latency = 2, Repeat rate = 2
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntTrapW    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [SFX0]>],
+                              [6, 2], // Latency = 2, Repeat rate = 2
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<BrB         , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [5, 2], // Latency = 1
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<BrCR        , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [5, 2, 2], // Latency = 1
+                              [CR_Bypass, CR_Bypass, CR_Bypass]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [5, 2], // Latency = 1
+                              [CR_Bypass, CR_Bypass]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0]>],
+                              [5, 2, 2], // Latency = 1
+                              [CR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBA    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBF    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoad    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops
+  InstrItinData<LdStLD      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLDARX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<3, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 3
+                              [GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<LdStLDU     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops
+  InstrItinData<LdStStore   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [NoBypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStICBI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFD    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStLFD     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [8, 2, 2], // Latency = 4, Repeat rate = 1
+                              [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops
+  InstrItinData<LdStLFDU    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [8, 2, 2], // Latency = 4, Repeat rate = 1
+                              [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops
+  InstrItinData<LdStLHA     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLHAU    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStLMW     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [LSU_0]>],
+                              [8, 2], // Latency = r+3, Repeat rate = r+3
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLWARX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<3, [LSU_0]>],
+                              [7, 2, 2], // Latency = 3, Repeat rate = 3
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTD     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1                              
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTDCX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1                              
+                              [NoBypass, GPR_Bypass]>,                              
+  InstrItinData<LdStSTDU    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [NoBypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStSTWCX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSync    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [CFX_0]>],
+                              [6, 2], // Latency = 2, Repeat rate = 4
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0], 0>]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<5, [CFX_0]>],
+                              [9, 2], // Latency = 5, Repeat rate = 5
+                              [GPR_Bypass, CR_Bypass]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [SFX0]>],
+                              [8, 2], // Latency = 4, Repeat rate = 4
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMFSPR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0]>],
+                              [5], // Latency = 1, Repeat rate = 1
+                              [GPR_Bypass]>,
+  InstrItinData<SprMFTB     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [CFX_0]>],
+                              [8, 2], // Latency = 4, Repeat rate = 4
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMTSPR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5], // Latency = 1, Repeat rate = 1
+                              [GPR_Bypass]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [FPU_0]>],
+                              [11, 2, 2], // Latency = 7, Repeat rate = 1 
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [FPU_0]>],
+                              [11, 2, 2], // Latency = 7, Repeat rate = 1 
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,                              
+  InstrItinData<FPCompare   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [FPU_0]>],
+                              [11, 2, 2], // Latency = 7, Repeat rate = 1
+                              [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivD      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<31, [FPU_0]>],
+                              [39, 2, 2], // Latency = 35, Repeat rate = 31
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivS      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<16, [FPU_0]>],
+                              [24, 2, 2], // Latency = 20, Repeat rate = 16 
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPFused     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [FPU_0]>],
+                              [11, 2, 2, 2], // Latency = 7, Repeat rate = 1
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPRes       , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [FPU_0]>],
+                              [12, 2], // Latency = 8, Repeat rate = 2
+                              [FPR_Bypass, FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e5500 machine model for scheduling and other instruction cost heuristics.
+
+def PPCE5500Model : SchedMachineModel {
+  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 6; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let Itineraries = PPCE5500Itineraries;
+}
diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td
index 61e89ed..72a0a39 100644
--- a/lib/Target/PowerPC/PPCScheduleG3.td
+++ b/lib/Target/PowerPC/PPCScheduleG3.td
@@ -34,12 +34,16 @@ def G3Itineraries : ProcessorItineraries<
   InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStDCBI    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLoad    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,  
   InstrItinData<LdStStore   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,  
   InstrItinData<LdStICBI    , [InstrStage<3, [SLU]>]>,
-  InstrItinData<LdStUX      , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLFD     , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLFDU    , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLHA     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<2, [SLU]>]>,  
   InstrItinData<LdStLMW     , [InstrStage<34, [SLU]>]>,
   InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSTWCX   , [InstrStage<8, [SLU]>]>,
@@ -58,6 +62,7 @@ def G3Itineraries : ProcessorItineraries<
   InstrItinData<SprRFI      , [InstrStage<2, [SRU]>]>,
   InstrItinData<SprSC       , [InstrStage<2, [SRU]>]>,
   InstrItinData<FPGeneral   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [FPU1]>]>,
   InstrItinData<FPCompare   , [InstrStage<1, [FPU1]>]>,
   InstrItinData<FPDivD      , [InstrStage<31, [FPU1]>]>,
   InstrItinData<FPDivS      , [InstrStage<17, [FPU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td
index e19ddfa..fc9120d 100644
--- a/lib/Target/PowerPC/PPCScheduleG4.td
+++ b/lib/Target/PowerPC/PPCScheduleG4.td
@@ -33,13 +33,17 @@ def G4Itineraries : ProcessorItineraries<
   InstrItinData<LdStDCBF    , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStDCBI    , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLoad    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStStore   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStDSS     , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStICBI    , [InstrStage<2, [SLU]>]>,
-  InstrItinData<LdStUX      , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLFD     , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLFDU    , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLHA     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<2, [SLU]>]>, 
   InstrItinData<LdStLMW     , [InstrStage<34, [SLU]>]>,
   InstrItinData<LdStLVecX   , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
@@ -60,6 +64,7 @@ def G4Itineraries : ProcessorItineraries<
   InstrItinData<SprRFI      , [InstrStage<2, [SRU]>]>,
   InstrItinData<SprSC       , [InstrStage<2, [SRU]>]>,
   InstrItinData<FPGeneral   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [FPU1]>]>,
   InstrItinData<FPCompare   , [InstrStage<1, [FPU1]>]>,
   InstrItinData<FPDivD      , [InstrStage<31, [FPU1]>]>,
   InstrItinData<FPDivS      , [InstrStage<17, [FPU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td
index e7446cb..a4e82ce 100644
--- a/lib/Target/PowerPC/PPCScheduleG4Plus.td
+++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -36,19 +36,24 @@ def G4PlusItineraries : ProcessorItineraries<
   InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStDCBI    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLoad    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStStore   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStDSS     , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStICBI    , [InstrStage<3, [IU2]>]>,
-  InstrItinData<LdStUX      , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLFD     , [InstrStage<4, [SLU]>]>,
   InstrItinData<LdStLFDU    , [InstrStage<4, [SLU]>]>,
   InstrItinData<LdStLHA     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<3, [SLU]>]>,  
   InstrItinData<LdStLMW     , [InstrStage<37, [SLU]>]>,
   InstrItinData<LdStLVecX   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLWA     , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSTD     , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSTDCX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDU    , [InstrStage<3, [SLU]>]>,  
   InstrItinData<LdStSTVEBX  , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSTWCX   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSync    , [InstrStage<35, [SLU]>]>,
@@ -66,6 +71,7 @@ def G4PlusItineraries : ProcessorItineraries<
   InstrItinData<SprRFI      , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
   InstrItinData<SprSC       , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
   InstrItinData<FPGeneral   , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<5, [FPU1]>]>,  
   InstrItinData<FPCompare   , [InstrStage<5, [FPU1]>]>,
   InstrItinData<FPDivD      , [InstrStage<35, [FPU1]>]>,
   InstrItinData<FPDivS      , [InstrStage<21, [FPU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td
index 1371499..7c02ea0 100644
--- a/lib/Target/PowerPC/PPCScheduleG5.td
+++ b/lib/Target/PowerPC/PPCScheduleG5.td
@@ -27,6 +27,7 @@ def G5Itineraries : ProcessorItineraries<
   InstrItinData<IntMulLI    , [InstrStage<4, [IU1, IU2]>]>,
   InstrItinData<IntRFID     , [InstrStage<1, [IU2]>]>,
   InstrItinData<IntRotateD  , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntRotateDI , [InstrStage<2, [IU1, IU2]>]>,  
   InstrItinData<IntRotate   , [InstrStage<4, [IU1, IU2]>]>,
   InstrItinData<IntShift    , [InstrStage<2, [IU1, IU2]>]>,
   InstrItinData<IntTrapD    , [InstrStage<1, [IU1, IU2]>]>,
@@ -37,15 +38,20 @@ def G5Itineraries : ProcessorItineraries<
   InstrItinData<BrMCRX      , [InstrStage<3, [BPU]>]>,
   InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLoad    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,  
   InstrItinData<LdStStore   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,  
   InstrItinData<LdStDSS     , [InstrStage<10, [SLU]>]>,
   InstrItinData<LdStICBI    , [InstrStage<40, [SLU]>]>,
-  InstrItinData<LdStUX      , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<4, [SLU]>]>,  
   InstrItinData<LdStLD      , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLDU     , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLDARX   , [InstrStage<11, [SLU]>]>,
   InstrItinData<LdStLFD     , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLFDU    , [InstrStage<5, [SLU]>]>,
   InstrItinData<LdStLHA     , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<5, [SLU]>]>,  
   InstrItinData<LdStLMW     , [InstrStage<64, [SLU]>]>,
   InstrItinData<LdStLVecX   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLWA     , [InstrStage<5, [SLU]>]>,
@@ -53,6 +59,7 @@ def G5Itineraries : ProcessorItineraries<
   InstrItinData<LdStSLBIA   , [InstrStage<40, [SLU]>]>, // needs work
   InstrItinData<LdStSLBIE   , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStSTD     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDU    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSTDCX   , [InstrStage<11, [SLU]>]>,
   InstrItinData<LdStSTVEBX  , [InstrStage<5, [SLU]>]>,
   InstrItinData<LdStSTWCX   , [InstrStage<11, [SLU]>]>,
@@ -69,6 +76,7 @@ def G5Itineraries : ProcessorItineraries<
   InstrItinData<SprMTSPR    , [InstrStage<8, [IU2]>]>,
   InstrItinData<SprSC       , [InstrStage<1, [IU2]>]>,
   InstrItinData<FPGeneral   , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<6, [FPU1, FPU2]>]>,
   InstrItinData<FPCompare   , [InstrStage<8, [FPU1, FPU2]>]>,
   InstrItinData<FPDivD      , [InstrStage<33, [FPU1, FPU2]>]>,
   InstrItinData<FPDivS      , [InstrStage<33, [FPU1, FPU2]>]>,
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 0207c83..b8b1614 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -41,6 +41,8 @@ namespace PPC {
     DIR_750, 
     DIR_970, 
     DIR_A2,
+    DIR_E500mc,
+    DIR_E5500,
     DIR_PWR6,
     DIR_PWR7,
     DIR_64  
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 15541ef..e64c140 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -129,7 +129,7 @@ def retflag       : SDNode<"SPISD::RET_FLAG", SDT_SPRet,
                            [SDNPHasChain, SDNPOptInGlue]>;
 
 def flushw        : SDNode<"SPISD::FLUSHW", SDTNone,
-                           [SDNPHasChain]>;
+                           [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
 
 def getPCX        : Operand<i32> {
   let PrintMethod = "printGetPCX";
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 8e215a7..62f973e 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -24,6 +24,16 @@ void TargetLibraryInfo::anchor() { }
 
 const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
   {
+    "_ZdaPv",
+    "_ZdlPv",
+    "_Znaj",
+    "_ZnajRKSt9nothrow_t",
+    "_Znam",
+    "_ZnamRKSt9nothrow_t",
+    "_Znwj",
+    "_ZnwjRKSt9nothrow_t",
+    "_Znwm",
+    "_ZnwmRKSt9nothrow_t",
     "__cxa_atexit",
     "__cxa_guard_abort",
     "__cxa_guard_acquire",
@@ -31,16 +41,29 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "__memcpy_chk",
     "acos",
     "acosf",
+    "acosh",
+    "acoshf",
+    "acoshl",
     "acosl",
     "asin",
     "asinf",
+    "asinh",
+    "asinhf",
+    "asinhl",
     "asinl",
     "atan",
     "atan2",
     "atan2f",
     "atan2l",
     "atanf",
+    "atanh",
+    "atanhf",
+    "atanhl",
     "atanl",
+    "calloc",
+    "cbrt",
+    "cbrtf",
+    "cbrtl",
     "ceil",
     "ceilf",
     "ceill",
@@ -54,6 +77,9 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "coshl",
     "cosl",
     "exp",
+    "exp10",
+    "exp10f",
+    "exp10l",
     "exp2",
     "exp2f",
     "exp2l",
@@ -74,6 +100,7 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "fmodl",
     "fputc",
     "fputs",
+    "free",
     "fwrite",
     "iprintf",
     "log",
@@ -86,8 +113,12 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "log2",
     "log2f",
     "log2l",
+    "logb",
+    "logbf",
+    "logbl",
     "logf",
     "logl",
+    "malloc",
     "memchr",
     "memcmp",
     "memcpy",
@@ -97,11 +128,14 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "nearbyint",
     "nearbyintf",
     "nearbyintl",
+    "posix_memalign",
     "pow",
     "powf",
     "powl",
     "putchar",
     "puts",
+    "realloc",
+    "reallocf",
     "rint",
     "rintf",
     "rintl",
@@ -121,10 +155,12 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "strcat",
     "strchr",
     "strcpy",
+    "strdup",
     "strlen",
     "strncat",
     "strncmp",
     "strncpy",
+    "strndup",
     "strnlen",
     "tan",
     "tanf",
@@ -134,7 +170,8 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "tanl",
     "trunc",
     "truncf",
-    "truncl"
+    "truncl",
+    "valloc"
   };
 
 /// initialize - Initialize the set of available library functions based on the
@@ -205,6 +242,21 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc::tanhl);
 
     // Win32 only has C89 math
+    TLI.setUnavailable(LibFunc::acosh);
+    TLI.setUnavailable(LibFunc::acoshf);
+    TLI.setUnavailable(LibFunc::acoshl);
+    TLI.setUnavailable(LibFunc::asinh);
+    TLI.setUnavailable(LibFunc::asinhf);
+    TLI.setUnavailable(LibFunc::asinhl);
+    TLI.setUnavailable(LibFunc::atanh);
+    TLI.setUnavailable(LibFunc::atanhf);
+    TLI.setUnavailable(LibFunc::atanhl);
+    TLI.setUnavailable(LibFunc::cbrt);
+    TLI.setUnavailable(LibFunc::cbrtf);
+    TLI.setUnavailable(LibFunc::cbrtl);
+    TLI.setUnavailable(LibFunc::exp10);
+    TLI.setUnavailable(LibFunc::exp10f);
+    TLI.setUnavailable(LibFunc::exp10l);
     TLI.setUnavailable(LibFunc::exp2);
     TLI.setUnavailable(LibFunc::exp2f);
     TLI.setUnavailable(LibFunc::exp2l);
@@ -217,6 +269,9 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc::log1p);
     TLI.setUnavailable(LibFunc::log1pf);
     TLI.setUnavailable(LibFunc::log1pl);
+    TLI.setUnavailable(LibFunc::logb);
+    TLI.setUnavailable(LibFunc::logbf);
+    TLI.setUnavailable(LibFunc::logbl);
     TLI.setUnavailable(LibFunc::nearbyint);
     TLI.setUnavailable(LibFunc::nearbyintf);
     TLI.setUnavailable(LibFunc::nearbyintl);
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 73a0095..c89e738 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -67,12 +67,19 @@ private:
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                                MCStreamer &Out);
 
-  bool MatchInstruction(SMLoc IDLoc,
+  bool MatchInstruction(SMLoc IDLoc,  unsigned &Kind,
                         SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                         SmallVectorImpl<MCInst> &MCInsts,
                         unsigned &OrigErrorInfo,
                         bool matchingInlineAsm = false);
 
+  unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst,
+                    const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               unsigned OperandNum, unsigned &NumMCOperands) {
+    return getMCInstOperandNumImpl(Kind, Inst, Operands, OperandNum,
+                                   NumMCOperands);
+  }
+
   /// isSrcOp - Returns true if operand is either (%rsi) or %ds:%(rsi)
   /// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode.
   bool isSrcOp(X86Operand &Op);
@@ -514,12 +521,13 @@ bool X86AsmParser::isDstOp(X86Operand &Op) {
 bool X86AsmParser::ParseRegister(unsigned &RegNo,
                                  SMLoc &StartLoc, SMLoc &EndLoc) {
   RegNo = 0;
-  if (!isParsingIntelSyntax()) {
-    const AsmToken &TokPercent = Parser.getTok();
-    assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!");
-    StartLoc = TokPercent.getLoc();
+  const AsmToken &PercentTok = Parser.getTok();
+  StartLoc = PercentTok.getLoc();
+
+  // If we encounter a %, ignore it. This code handles registers with and
+  // without the prefix, unprefixed registers can occur in cfi directives.
+  if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
     Parser.Lex(); // Eat percent token.
-  }
 
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier)) {
@@ -1516,9 +1524,12 @@ bool X86AsmParser::
 MatchAndEmitInstruction(SMLoc IDLoc,
                         SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                         MCStreamer &Out) {
-  SmallVector<MCInst, 2> Insts;
+  unsigned Kind;
   unsigned ErrorInfo;
-  bool Error = MatchInstruction(IDLoc, Operands, Insts, ErrorInfo);
+  SmallVector<MCInst, 2> Insts;
+
+  bool Error = MatchInstruction(IDLoc, Kind, Operands, Insts,
+                                ErrorInfo);
   if (!Error)
     for (unsigned i = 0, e = Insts.size(); i != e; ++i)
       Out.EmitInstruction(Insts[i]);
@@ -1526,7 +1537,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
 }
 
 bool X86AsmParser::
-MatchInstruction(SMLoc IDLoc,
+MatchInstruction(SMLoc IDLoc, unsigned &Kind,
                  SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                  SmallVectorImpl<MCInst> &MCInsts, unsigned &OrigErrorInfo,
                  bool matchingInlineAsm) {
@@ -1537,7 +1548,7 @@ MatchInstruction(SMLoc IDLoc,
 
   // First, handle aliases that expand to multiple instructions.
   // FIXME: This should be replaced with a real .td file alias mechanism.
-  // Also, MatchInstructionImpl should do actually *do* the EmitInstruction
+  // Also, MatchInstructionImpl should actually *do* the EmitInstruction
   // call.
   if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" ||
       Op->getToken() == "fstsww" || Op->getToken() == "fstcww" ||
@@ -1568,7 +1579,7 @@ MatchInstruction(SMLoc IDLoc,
   MCInst Inst;
 
   // First, try a direct match.
-  switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo,
+  switch (MatchInstructionImpl(Operands, Kind, Inst, OrigErrorInfo,
                                isParsingIntelSyntax())) {
   default: break;
   case Match_Success:
@@ -1585,9 +1596,6 @@ MatchInstruction(SMLoc IDLoc,
     Error(IDLoc, "instruction requires a CPU feature not currently enabled",
           EmptyRanges, matchingInlineAsm);
     return true;
-  case Match_ConversionFail:
-    return Error(IDLoc, "unable to convert operands to instruction",
-                 EmptyRanges, matchingInlineAsm);
   case Match_InvalidOperand:
     WasOriginallyInvalidOperand = true;
     break;
@@ -1619,14 +1627,19 @@ MatchInstruction(SMLoc IDLoc,
   Tmp[Base.size()] = Suffixes[0];
   unsigned ErrorInfoIgnore;
   unsigned Match1, Match2, Match3, Match4;
+  unsigned tKind;
 
-  Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+  Match1 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore);
+  if (Match1 == Match_Success) Kind = tKind;
   Tmp[Base.size()] = Suffixes[1];
-  Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+  Match2 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore);
+  if (Match2 == Match_Success) Kind = tKind;
   Tmp[Base.size()] = Suffixes[2];
-  Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+  Match3 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore);
+  if (Match3 == Match_Success) Kind = tKind;
   Tmp[Base.size()] = Suffixes[3];
-  Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+  Match4 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore);
+  if (Match4 == Match_Success) Kind = tKind;
 
   // Restore the old token.
   Op->setTokenValue(Base);
@@ -1677,8 +1690,10 @@ MatchInstruction(SMLoc IDLoc,
   if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) &&
       (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) {
     if (!WasOriginallyInvalidOperand) {
+      ArrayRef<SMRange> Ranges = matchingInlineAsm ? EmptyRanges :
+        Op->getLocRange();
       return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
-                   Op->getLocRange(), matchingInlineAsm);
+                   Ranges, matchingInlineAsm);
     }
 
     // Recover location info for the operand if we know which was the problem.
@@ -1730,7 +1745,10 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
     return ParseDirectiveWord(2, DirectiveID.getLoc());
   else if (IDVal.startswith(".code"))
     return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
-  else if (IDVal.startswith(".intel_syntax")) {
+  else if (IDVal.startswith(".att_syntax")) {
+    getParser().setAssemblerDialect(0);
+    return false;
+  } else if (IDVal.startswith(".intel_syntax")) {
     getParser().setAssemblerDialect(1);
     if (getLexer().isNot(AsmToken::EndOfStatement)) {
       if(Parser.getTok().getString() == "noprefix") {
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 5039887..f136927 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -44,7 +44,7 @@ void x86DisassemblerDebug(const char *file,
   dbgs() << file << ":" << line << ": " << s;
 }
 
-const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii) {
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) {
   const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
   return MII->getName(Opcode);
 }
@@ -95,8 +95,8 @@ const EDInstInfo *X86GenericDisassembler::getEDInfo() const {
 ///                   be a pointer to a MemoryObject.
 /// @param byte     - A pointer to the byte to be read.
 /// @param address  - The address to be read.
-static int regionReader(void* arg, uint8_t* byte, uint64_t address) {
-  MemoryObject* region = static_cast<MemoryObject*>(arg);
+static int regionReader(const void* arg, uint8_t* byte, uint64_t address) {
+  const MemoryObject* region = static_cast<const MemoryObject*>(arg);
   return region->readByte(address, byte);
 }
 
@@ -135,10 +135,10 @@ X86GenericDisassembler::getInstruction(MCInst &instr,
   
   int ret = decodeInstruction(&internalInstr,
                               regionReader,
-                              (void*)&region,
+                              (const void*)&region,
                               loggerFn,
                               (void*)&vStream,
-                              (void*)MII,
+                              (const void*)MII,
                               address,
                               fMode);
 
@@ -379,6 +379,8 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
   }
 
   switch (type) {
+  case TYPE_XMM32:
+  case TYPE_XMM64:
   case TYPE_XMM128:
     mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
     return;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index 0c92912..af444d1 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -200,7 +200,7 @@ static void unconsumeByte(struct InternalInstruction* insn) {
                              insn->readerCursor + offset);        \
       if (ret)                                                    \
         return ret;                                               \
-      combined = combined | ((type)byte << ((type)offset * 8));   \
+      combined = combined | ((uint64_t)byte << (offset * 8));     \
     }                                                             \
     *ptr = combined;                                              \
     insn->readerCursor += sizeof(type);                           \
@@ -719,7 +719,7 @@ static BOOL is16BitEquvalent(const char* orig, const char* equiv) {
  * @return      - 0 if the ModR/M could be read when needed or was not needed;
  *                nonzero otherwise.
  */
-static int getID(struct InternalInstruction* insn, void *miiArg) {
+static int getID(struct InternalInstruction* insn, const void *miiArg) {
   uint8_t attrMask;
   uint16_t instructionID;
   
@@ -1621,10 +1621,10 @@ static int readOperands(struct InternalInstruction* insn) {
  */
 int decodeInstruction(struct InternalInstruction* insn,
                       byteReader_t reader,
-                      void* readerArg,
+                      const void* readerArg,
                       dlog_t logger,
                       void* loggerArg,
-                      void* miiArg,
+                      const void* miiArg,
                       uint64_t startLoc,
                       DisassemblerMode mode) {
   memset(insn, 0, sizeof(struct InternalInstruction));
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 797703f..05cbb4c 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -403,7 +403,7 @@ typedef uint8_t BOOL;
  *                  be read from.
  * @return        - -1 if the byte cannot be read for any reason; 0 otherwise.
  */
-typedef int (*byteReader_t)(void* arg, uint8_t* byte, uint64_t address);
+typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address);
 
 /*
  * dlog_t - Type for the logging function that the consumer can provide to
@@ -422,7 +422,7 @@ struct InternalInstruction {
   /* Reader interface (C) */
   byteReader_t reader;
   /* Opaque value passed to the reader */
-  void* readerArg;
+  const void* readerArg;
   /* The address of the next byte to read via the reader */
   uint64_t readerCursor;
 
@@ -561,10 +561,10 @@ struct InternalInstruction {
  */
 int decodeInstruction(struct InternalInstruction* insn,
                       byteReader_t reader,
-                      void* readerArg,
+                      const void* readerArg,
                       dlog_t logger,
                       void* loggerArg,
-                      void* miiArg,
+                      const void* miiArg,
                       uint64_t startLoc,
                       DisassemblerMode mode);
 
@@ -579,7 +579,7 @@ void x86DisassemblerDebug(const char *file,
                           unsigned line,
                           const char *s);
 
-const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii);
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii);
 
 #ifdef __cplusplus
 }
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 624e56f..4011035 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -941,3 +941,15 @@ and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
 cost of reduced accuracy.
 
 //===---------------------------------------------------------------------===//
+
+This function should be matched to haddpd when the appropriate CPU is enabled:
+
+#include <x86intrin.h>
+double f (__m128d p) {
+  return p[0] + p[1];
+}
+
+similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
+turn into hsubpd also.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 18e6b7c..d078a7b 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -120,6 +120,9 @@ def FeatureBMI2    : SubtargetFeature<"bmi2", "HasBMI2", "true",
                                       "Support BMI2 instructions">;
 def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
                                      "Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
+                                     "HasSlowDivide", "true",
+                                     "Use small divide for positive values less than 256">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -160,7 +163,8 @@ def : Proc<"core2",           [FeatureSSSE3, FeatureCMPXCHG16B,
 def : Proc<"penryn",          [FeatureSSE41, FeatureCMPXCHG16B,
                                FeatureSlowBTMem]>;
 def : AtomProc<"atom",        [ProcIntelAtom, FeatureSSE3, FeatureCMPXCHG16B,
-                               FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP]>;
+                               FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
+                               FeatureSlowDivide]>;
 // "Arrandale" along with corei3 and corei5
 def : Proc<"corei7",          [FeatureSSE42, FeatureCMPXCHG16B,
                                FeatureSlowBTMem, FeatureFastUAMem,
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index db71e27..a4785c9 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -233,12 +233,14 @@ void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo,
 
 
 void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
-                                 raw_ostream &O, const char *Modifier) {
+                                 raw_ostream &O, const char *Modifier,
+                                 unsigned AsmVariant) {
   const MachineOperand &MO = MI->getOperand(OpNo);
   switch (MO.getType()) {
   default: llvm_unreachable("unknown operand type!");
   case MachineOperand::MO_Register: {
-    O << '%';
+    // FIXME: Enumerating AsmVariant, so we can remove magic number.
+    if (AsmVariant == 0) O << '%';
     unsigned Reg = MO.getReg();
     if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
       EVT VT = (strcmp(Modifier+6,"64") == 0) ?
@@ -471,7 +473,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
     }
   }
 
-  printOperand(MI, OpNo, O);
+  printOperand(MI, OpNo, O, /*Modifier*/ 0, AsmVariant);
   return false;
 }
 
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 35386cd..0062387 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -50,7 +50,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
 
   // These methods are used by the tablegen'erated instruction printer.
   void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
-                    const char *Modifier = 0);
+                    const char *Modifier = 0, unsigned AsmVariant = 0);
   void print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
 
   bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index d705049..e202321 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -26,7 +26,6 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/Function.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCExpr.h"
@@ -134,8 +133,7 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
   IsPIC = TM.getRelocationModel() == Reloc::PIC_;
 
   do {
-    DEBUG(dbgs() << "JITTing function '"
-          << MF.getFunction()->getName() << "'\n");
+    DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n");
     MCE.startFunction(MF);
     for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
          MBB != E; ++MBB) {
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index e5952aa..54704d8 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -2014,13 +2014,17 @@ X86FastISel::TargetSelectInstruction(const Instruction *I)  {
 unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
   MVT VT;
   if (!isTypeLegal(C->getType(), VT))
-    return false;
+    return 0;
+
+  // Can't handle alternate code models yet.
+  if (TM.getCodeModel() != CodeModel::Small)
+    return 0;
 
   // Get opcode and regclass of the output for the given load instruction.
   unsigned Opc = 0;
   const TargetRegisterClass *RC = NULL;
   switch (VT.SimpleTy) {
-  default: return false;
+  default: return 0;
   case MVT::i8:
     Opc = X86::MOV8rm;
     RC  = &X86::GR8RegClass;
@@ -2058,7 +2062,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
     break;
   case MVT::f80:
     // No f80 support yet.
-    return false;
+    return 0;
   }
 
   // Materialize addresses with LEA instructions.
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 955c75a..9d5de81 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -171,6 +171,7 @@ namespace {
     // Shuffle live registers to match the expectations of successor blocks.
     void finishBlockStack();
 
+#ifndef NDEBUG
     void dumpStack() const {
       dbgs() << "Stack contents:";
       for (unsigned i = 0; i != StackTop; ++i) {
@@ -181,6 +182,7 @@ namespace {
         dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
       dbgs() << "\n";
     }
+#endif
 
     /// getSlot - Return the stack slot number a particular register number is
     /// in.
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 27195b4..5fdc61e 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -100,6 +100,7 @@ namespace {
       Base_Reg = Reg;
     }
 
+#ifndef NDEBUG
     void dump() {
       dbgs() << "X86ISelAddressMode " << this << '\n';
       dbgs() << "Base_Reg ";
@@ -133,6 +134,7 @@ namespace {
         dbgs() << "nul";
       dbgs() << " JT" << JT << " Align" << Align << '\n';
     }
+#endif
   };
 }
 
@@ -1011,7 +1013,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
           AM.IndexReg = ShVal.getNode()->getOperand(0);
           ConstantSDNode *AddVal =
             cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
-          uint64_t Disp = AddVal->getSExtValue() << Val;
+          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
           if (!FoldOffsetIntoAddress(Disp, AM))
             return false;
         }
@@ -2116,7 +2118,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
 
     // Make sure that we don't change the operation by removing bits.
     // This only matters for OR and XOR, AND is unaffected.
-    if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val)
+    uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
+    if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
       break;
 
     unsigned ShlOp, Op;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7954170..5c525ae 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -85,7 +85,7 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                * ElemsPerChunk);
 
-  SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                VecIdx);
 
@@ -118,7 +118,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                                * ElemsPerChunk);
 
-  SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                      VecIdx);
 }
@@ -182,6 +182,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setSchedulingPreference(Sched::RegPressure);
   setStackPointerRegisterToSaveRestore(X86StackPtr);
 
+  // Bypass i32 with i8 on Atom when compiling with O2
+  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
+    addBypassSlowDivType(Type::getInt32Ty(getGlobalContext()), Type::getInt8Ty(getGlobalContext()));
+
   if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
     // Setup Windows compiler runtime calls.
     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
@@ -735,6 +739,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
@@ -824,6 +829,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
@@ -857,6 +863,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 
     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
@@ -925,6 +932,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
+
+    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
   }
 
   if (Subtarget->hasSSE41()) {
@@ -939,6 +948,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
 
+    setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
+
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
 
@@ -1016,19 +1028,25 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
 
     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
 
     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
 
+    setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
+
     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
 
@@ -1052,7 +1070,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
     setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
 
-    if (Subtarget->hasFMA()) {
+    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
       setOperationAction(ISD::FMA,             MVT::v8f32, Custom);
       setOperationAction(ISD::FMA,             MVT::v4f64, Custom);
       setOperationAction(ISD::FMA,             MVT::v4f32, Custom);
@@ -2832,7 +2850,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
       MachineFrameInfo *MFI = MF.getFrameInfo();
       const MachineRegisterInfo *MRI = &MF.getRegInfo();
       const X86InstrInfo *TII =
-        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
+        ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
         CCValAssign &VA = ArgLocs[i];
         SDValue Arg = OutVals[i];
@@ -3506,25 +3524,26 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
       MatchOddMask = false;
   }
-  static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1};
-  static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1};
 
-  const int *CompactionMask;
-  if (MatchEvenMask)
-    CompactionMask = CompactionMaskEven;
-  else if (MatchOddMask)
-    CompactionMask = CompactionMaskOdd;
-  else
+  if (!MatchEvenMask && !MatchOddMask)
     return SDValue();
-
+  
   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
 
-  SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0),
-                                     UndefNode, CompactionMask);
-  SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1),
-                                     UndefNode, CompactionMask);
-  static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13};
-  return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask);
+  SDValue Op0 = SVOp->getOperand(0);
+  SDValue Op1 = SVOp->getOperand(1);
+
+  if (MatchEvenMask) {
+    // Shift the second operand right to 32 bits.
+    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
+    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
+  } else {
+    // Shift the first operand left to 32 bits.
+    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
+    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
+  }
+  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
+  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
 }
 
 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
@@ -4977,6 +4996,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
                                 LDBase->getAlignment(),
                                 false/*isVolatile*/, true/*ReadMem*/,
                                 false/*WriteMem*/);
+
+    // Make sure the newly-created LOAD is in the same position as LDBase in
+    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
+    // update uses of LDBase's output chain to use the TokenFactor.
+    if (LDBase->hasAnyUseOfValue(1)) {
+      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                             SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                             SDValue(ResNode.getNode(), 1));
+    }
+
     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
   }
   return SDValue();
@@ -5881,8 +5912,6 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
   DebugLoc dl = SVOp->getDebugLoc();
   ArrayRef<int> MaskVals = SVOp->getMask();
 
-  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
-
   // If we have SSSE3, case 1 is generated when all result bytes come from
   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
   // present, fall back to case 3.
@@ -5906,7 +5935,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                      DAG.getNode(ISD::BUILD_VECTOR, dl,
                                  MVT::v16i8, &pshufbMask[0], 16));
-    if (V2IsUndef)
+
+    // As PSHUFB will zero elements with negative indices, it's safe to ignore
+    // the 2nd operand if it's undefined or zero.
+    if (V2.getOpcode() == ISD::UNDEF ||
+        ISD::isBuildVectorAllZeros(V2.getNode()))
       return V1;
 
     // Calculate the shuffle mask for the second input, shuffle it, and
@@ -5992,6 +6025,51 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
 }
 
+// v32i8 shuffles - Translate to VPSHUFB if possible.
+static
+SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
+                                 SelectionDAG &DAG,
+                                 const X86TargetLowering &TLI) {
+  EVT VT = SVOp->getValueType(0);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  // VPSHUFB may be generated if 
+  // (1) one of input vector is undefined or zeroinitializer.
+  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
+  // And (2) the mask indexes don't cross the 128-bit lane.
+  if (VT != MVT::v32i8 || !TLI.getSubtarget()->hasAVX2() ||
+      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
+    return SDValue();
+
+  if (V1IsAllZero && !V2IsAllZero) {
+    CommuteVectorShuffleMask(MaskVals, 32);
+    V1 = V2;
+  }
+  SmallVector<SDValue, 32> pshufbMask;
+  for (unsigned i = 0; i != 32; i++) {
+    int EltIdx = MaskVals[i];
+    if (EltIdx < 0 || EltIdx >= 32)
+      EltIdx = 0x80;
+    else {
+      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
+        // Cross lane is not allowed.
+        return SDValue();
+      EltIdx &= 0xf;
+    }
+    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+  }
+  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
+                      DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                  MVT::v32i8, &pshufbMask[0], 32));
+}
+
 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
 /// done when every pair / quad of shuffle mask elements point to elements in
@@ -6818,6 +6896,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
       return NewOp;
   }
 
+  if (VT == MVT::v32i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, DAG, *this);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
   // Handle all 128-bit wide vectors with 4 elements, and match them with
   // several different shuffle types.
   if (NumElems == 4 && VT.is128BitVector())
@@ -8115,26 +8199,35 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
   return FIST;
 }
 
-SDValue X86TargetLowering::LowerFABS(SDValue Op,
-                                     SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
   LLVMContext *Context = DAG.getContext();
   DebugLoc dl = Op.getDebugLoc();
   EVT VT = Op.getValueType();
   EVT EltVT = VT;
-  if (VT.isVector())
+  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  if (VT.isVector()) {
     EltVT = VT.getVectorElementType();
-  Constant *C;
-  if (EltVT == MVT::f64) {
-    C = ConstantVector::getSplat(2,
-                ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
-  } else {
-    C = ConstantVector::getSplat(4,
-               ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
+    NumElts = VT.getVectorNumElements();
   }
-  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  Constant *C;
+  if (EltVT == MVT::f64)
+    C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
+  else
+    C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
+  C = ConstantVector::getSplat(NumElts, C);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
-                             false, false, false, 16);
+                             false, false, false, Alignment);
+  if (VT.isVector()) {
+    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getNode(ISD::AND, dl, ANDVT,
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
+                                               Op.getOperand(0)),
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
+  }
   return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
 }
 
@@ -8154,10 +8247,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
   else
     C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
   C = ConstantVector::getSplat(NumElts, C);
-  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
-                             false, false, false, 16);
+                             false, false, false, Alignment);
   if (VT.isVector()) {
     MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
     return DAG.getNode(ISD::BITCAST, dl, VT,
@@ -9943,62 +10037,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
                                Op.getOperand(1), Op.getOperand(2), DAG);
   }
 
-  // Fix vector shift instructions where the last operand is a non-immediate
-  // i32 value.
-  case Intrinsic::x86_mmx_pslli_w:
-  case Intrinsic::x86_mmx_pslli_d:
-  case Intrinsic::x86_mmx_pslli_q:
-  case Intrinsic::x86_mmx_psrli_w:
-  case Intrinsic::x86_mmx_psrli_d:
-  case Intrinsic::x86_mmx_psrli_q:
-  case Intrinsic::x86_mmx_psrai_w:
-  case Intrinsic::x86_mmx_psrai_d: {
-    SDValue ShAmt = Op.getOperand(2);
-    if (isa<ConstantSDNode>(ShAmt))
-      return SDValue();
-
-    unsigned NewIntNo;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_mmx_pslli_w:
-      NewIntNo = Intrinsic::x86_mmx_psll_w;
-      break;
-    case Intrinsic::x86_mmx_pslli_d:
-      NewIntNo = Intrinsic::x86_mmx_psll_d;
-      break;
-    case Intrinsic::x86_mmx_pslli_q:
-      NewIntNo = Intrinsic::x86_mmx_psll_q;
-      break;
-    case Intrinsic::x86_mmx_psrli_w:
-      NewIntNo = Intrinsic::x86_mmx_psrl_w;
-      break;
-    case Intrinsic::x86_mmx_psrli_d:
-      NewIntNo = Intrinsic::x86_mmx_psrl_d;
-      break;
-    case Intrinsic::x86_mmx_psrli_q:
-      NewIntNo = Intrinsic::x86_mmx_psrl_q;
-      break;
-    case Intrinsic::x86_mmx_psrai_w:
-      NewIntNo = Intrinsic::x86_mmx_psra_w;
-      break;
-    case Intrinsic::x86_mmx_psrai_d:
-      NewIntNo = Intrinsic::x86_mmx_psra_d;
-      break;
-    }
-
-    // The vector shift intrinsics with scalars uses 32b shift amounts but
-    // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
-    // to be zero.
-    ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt,
-                         DAG.getConstant(0, MVT::i32));
-// FIXME this must be lowered to get rid of the invalid type.
-
-    EVT VT = Op.getValueType();
-    ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
-    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(NewIntNo, MVT::i32),
-                       Op.getOperand(1), ShAmt);
-  }
   case Intrinsic::x86_sse42_pcmpistria128:
   case Intrinsic::x86_sse42_pcmpestria128:
   case Intrinsic::x86_sse42_pcmpistric128:
@@ -10077,6 +10115,74 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
     return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
   }
+  case Intrinsic::x86_fma_vfmadd_ps:
+  case Intrinsic::x86_fma_vfmadd_pd:
+  case Intrinsic::x86_fma_vfmsub_ps:
+  case Intrinsic::x86_fma_vfmsub_pd:
+  case Intrinsic::x86_fma_vfnmadd_ps:
+  case Intrinsic::x86_fma_vfnmadd_pd:
+  case Intrinsic::x86_fma_vfnmsub_ps:
+  case Intrinsic::x86_fma_vfnmsub_pd:
+  case Intrinsic::x86_fma_vfmaddsub_ps:
+  case Intrinsic::x86_fma_vfmaddsub_pd:
+  case Intrinsic::x86_fma_vfmsubadd_ps:
+  case Intrinsic::x86_fma_vfmsubadd_pd:
+  case Intrinsic::x86_fma_vfmadd_ps_256:
+  case Intrinsic::x86_fma_vfmadd_pd_256:
+  case Intrinsic::x86_fma_vfmsub_ps_256:
+  case Intrinsic::x86_fma_vfmsub_pd_256:
+  case Intrinsic::x86_fma_vfnmadd_ps_256:
+  case Intrinsic::x86_fma_vfnmadd_pd_256:
+  case Intrinsic::x86_fma_vfnmsub_ps_256:
+  case Intrinsic::x86_fma_vfnmsub_pd_256:
+  case Intrinsic::x86_fma_vfmaddsub_ps_256:
+  case Intrinsic::x86_fma_vfmaddsub_pd_256:
+  case Intrinsic::x86_fma_vfmsubadd_ps_256:
+  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+    unsigned Opc;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_fma_vfmadd_ps:
+    case Intrinsic::x86_fma_vfmadd_pd:
+    case Intrinsic::x86_fma_vfmadd_ps_256:
+    case Intrinsic::x86_fma_vfmadd_pd_256:
+      Opc = X86ISD::FMADD;
+      break;
+    case Intrinsic::x86_fma_vfmsub_ps:
+    case Intrinsic::x86_fma_vfmsub_pd:
+    case Intrinsic::x86_fma_vfmsub_ps_256:
+    case Intrinsic::x86_fma_vfmsub_pd_256:
+      Opc = X86ISD::FMSUB;
+      break;
+    case Intrinsic::x86_fma_vfnmadd_ps:
+    case Intrinsic::x86_fma_vfnmadd_pd:
+    case Intrinsic::x86_fma_vfnmadd_ps_256:
+    case Intrinsic::x86_fma_vfnmadd_pd_256:
+      Opc = X86ISD::FNMADD;
+      break;
+    case Intrinsic::x86_fma_vfnmsub_ps:
+    case Intrinsic::x86_fma_vfnmsub_pd:
+    case Intrinsic::x86_fma_vfnmsub_ps_256:
+    case Intrinsic::x86_fma_vfnmsub_pd_256:
+      Opc = X86ISD::FNMSUB;
+      break;
+    case Intrinsic::x86_fma_vfmaddsub_ps:
+    case Intrinsic::x86_fma_vfmaddsub_pd:
+    case Intrinsic::x86_fma_vfmaddsub_ps_256:
+    case Intrinsic::x86_fma_vfmaddsub_pd_256:
+      Opc = X86ISD::FMADDSUB;
+      break;
+    case Intrinsic::x86_fma_vfmsubadd_ps:
+    case Intrinsic::x86_fma_vfmsubadd_pd:
+    case Intrinsic::x86_fma_vfmsubadd_ps_256:
+    case Intrinsic::x86_fma_vfmsubadd_pd_256:
+      Opc = X86ISD::FMSUBADD;
+      break;
+    }
+
+    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3));
+  }
   }
 }
 
@@ -10918,7 +11024,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
 
-        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);;
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
       }
       // fall through
     case MVT::v4i32:
@@ -14020,7 +14126,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
 //
 // where Op could be BRCOND or CMOV.
 //
-static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   // Quit if not CMP and SUB with its value result used.
   if (Cmp.getOpcode() != X86ISD::CMP &&
       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
@@ -14056,40 +14162,133 @@ static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
     SetCC = SetCC.getOperand(0);
 
-  // Quit if not SETCC.
-  // FIXME: So far we only handle the boolean value generated from SETCC. If
-  // there is other ways to generate boolean values, we need handle them here
-  // as well.
-  if (SetCC.getOpcode() != X86ISD::SETCC)
+  switch (SetCC.getOpcode()) {
+  case X86ISD::SETCC:
+    // Set the condition code or opposite one if necessary.
+    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(1);
+  case X86ISD::CMOV: {
+    // Check whether false/true value has canonical one, i.e. 0 or 1.
+    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+    // Quit if true value is not a constant.
+    if (!TVal)
+      return SDValue();
+    // Quit if false value is not a constant.
+    if (!FVal) {
+      // A special case for rdrand, where 0 is set if false cond is found.
+      SDValue Op = SetCC.getOperand(0);
+      if (Op.getOpcode() != X86ISD::RDRAND)
+        return SDValue();
+    }
+    // Quit if false value is not the constant 0 or 1.
+    bool FValIsFalse = true;
+    if (FVal && FVal->getZExtValue() != 0) {
+      if (FVal->getZExtValue() != 1)
+        return SDValue();
+      // If FVal is 1, opposite cond is needed.
+      needOppositeCond = !needOppositeCond;
+      FValIsFalse = false;
+    }
+    // Quit if TVal is not the constant opposite of FVal.
+    if (FValIsFalse && TVal->getZExtValue() != 1)
+      return SDValue();
+    if (!FValIsFalse && TVal->getZExtValue() != 0)
+      return SDValue();
+    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(3);
+  }
+  }
+
+  return SDValue();
+}
+
+/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS
+/// updated. If only flag result is used and the result is evaluated from a
+/// series of element extraction, try to combine it into a PTEST.
+static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC,
+                                     SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget) {
+  SDNode *N = Or.getNode();
+  DebugLoc DL = N->getDebugLoc();
+
+  // Only SSE4.1 and beyond supports PTEST or like.
+  if (!Subtarget->hasSSE41())
     return SDValue();
 
-  // Set the condition code or opposite one if necessary.
-  CC = X86::CondCode(SetCC.getConstantOperandVal(0));
-  if (needOppositeCond)
-    CC = X86::GetOppositeBranchCondition(CC);
+  if (N->getOpcode() != X86ISD::OR)
+    return SDValue();
 
-  return SetCC.getOperand(1);
-}
+  // Quit if the value result of OR is used.
+  if (N->hasAnyUseOfValue(0))
+    return SDValue();
 
-static bool IsValidFCMOVCondition(X86::CondCode CC) {
-  switch (CC) {
-  default:
-    return false;
-  case X86::COND_B:
-  case X86::COND_BE:
-  case X86::COND_E:
-  case X86::COND_P:
-  case X86::COND_AE:
-  case X86::COND_A:
-  case X86::COND_NE:
-  case X86::COND_NP:
-    return true;
+  // Quit if not used as a boolean value.
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  SmallVector<SDValue, 8> Opnds;
+  SDValue VecIn;
+  EVT VT = MVT::Other;
+  unsigned Mask = 0;
+
+  // Recognize a special case where a vector is casted into wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+    // BFS traverse all OR'd operands.
+    if (I->getOpcode() == ISD::OR) {
+      Opnds.push_back(I->getOperand(0));
+      Opnds.push_back(I->getOperand(1));
+      // Re-evaluate the number of nodes to be traversed.
+      e += 2; // 2 more nodes (LHS and RHS) are pushed.
+      continue;
+    }
+
+    // Quit if a non-EXTRACT_VECTOR_ELT
+    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if without a constant index.
+    SDValue Idx = I->getOperand(1);
+    if (!isa<ConstantSDNode>(Idx))
+      return SDValue();
+
+    // Check if all elements are extracted from the same vector.
+    SDValue ExtractedFromVec = I->getOperand(0);
+    if (VecIn.getNode() == 0) {
+      VT = ExtractedFromVec.getValueType();
+      // FIXME: only 128-bit vector is supported so far.
+      if (!VT.is128BitVector())
+        return SDValue();
+      VecIn = ExtractedFromVec;
+    } else if (VecIn != ExtractedFromVec)
+      return SDValue();
+
+    // Record the constant index.
+    Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
   }
+
+  assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far.");
+
+  // Quit if not all elements are used.
+  if (Mask != (1U << VT.getVectorNumElements()) - 1U)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn);
 }
 
 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
-                                  TargetLowering::DAGCombinerInfo &DCI) {
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
   DebugLoc DL = N->getDebugLoc();
 
   // If the flag operand isn't dead, don't touch this CMOV.
@@ -14114,10 +14313,18 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
 
   SDValue Flags;
 
-  Flags = BoolTestSetCCCombine(Cond, CC);
+  Flags = checkBoolTestSetCCCombine(Cond, CC);
   if (Flags.getNode() &&
       // Extra check as FCMOV only supports a subset of X86 cond.
-      (FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) {
+      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
+    SDValue Ops[] = { FalseOp, TrueOp,
+                      DAG.getConstant(CC, MVT::i8), Flags };
+    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
+                       Ops, array_lengthof(Ops));
+  }
+
+  Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget);
+  if (Flags.getNode()) {
     SDValue Ops[] = { FalseOp, TrueOp,
                       DAG.getConstant(CC, MVT::i8), Flags };
     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
@@ -15384,7 +15591,7 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
     return SDValue();
 
   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
-  // into FMINC and MMAXC, which are Commutative operations.
+  // into FMINC and FMAXC, which are Commutative operations.
   unsigned NewOp = 0;
   switch (N->getOpcode()) {
     default: llvm_unreachable("unknown opcode");
@@ -15502,8 +15709,13 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
   DebugLoc dl = N->getDebugLoc();
   EVT VT = N->getValueType(0);
 
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
   EVT ScalarVT = VT.getScalarType();
-  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA())
+  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
     return SDValue();
 
   SDValue A = N->getOperand(0);
@@ -15525,9 +15737,10 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
 
   unsigned Opcode;
   if (!NegMul)
-    Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB;
+    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
   else
-    Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB;
+    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
   return DAG.getNode(Opcode, dl, VT, A, B, C);
 }
 
@@ -15625,7 +15838,9 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
 }
 
 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
-static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
   DebugLoc DL = N->getDebugLoc();
   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
   SDValue EFLAGS = N->getOperand(1);
@@ -15641,7 +15856,13 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
 
   SDValue Flags;
 
-  Flags = BoolTestSetCCCombine(EFLAGS, CC);
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+  }
+
+  Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
   if (Flags.getNode()) {
     SDValue Cond = DAG.getConstant(CC, MVT::i8);
     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
@@ -15663,7 +15884,14 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
 
   SDValue Flags;
 
-  Flags = BoolTestSetCCCombine(EFLAGS, CC);
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+                       Flags);
+  }
+
+  Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
   if (Flags.getNode()) {
     SDValue Cond = DAG.getConstant(CC, MVT::i8);
     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
@@ -15858,7 +16086,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
   case ISD::VSELECT:
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
-  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
+  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
@@ -15888,7 +16116,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
-  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
+  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
   case X86ISD::PALIGN:
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index b0c27c8..bfe9541 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -16,15 +16,18 @@
 //
 
 // Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
 let isTerminator = 1, isReturn = 1, isBarrier = 1,
     hasCtrlDep = 1, FPForm = SpecialFP in {
-  def RET    : I   <0xC3, RawFrm, (outs), (ins),
+  def RET    : I   <0xC3, RawFrm, (outs), (ins variable_ops),
                     "ret",
                     [(X86retflag 0)], IIC_RET>;
   def RETW   : I   <0xC3, RawFrm, (outs), (ins),
                     "ret{w}",
                     [], IIC_RET>, OpSize;
-  def RETI   : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+  def RETI   : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
                     "ret\t$amt",
                     [(X86retflag timm:$amt)], IIC_RET_IMM>;
   def RETIW  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 95ee7e5..5663800 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -19,7 +19,8 @@ let Constraints = "$src1 = $dst" in {
 multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
                     PatFrag MemFrag128, PatFrag MemFrag256,
                     ValueType OpVT128, ValueType OpVT256,
-                    SDPatternOperator Op = null_frag, bit MayLoad = 1> {
+                    SDPatternOperator Op = null_frag> {
+  let isCommutable = 1 in
   def r     : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, VR128:$src3),
                    !strconcat(OpcodeStr,
@@ -27,7 +28,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
                    [(set VR128:$dst, (OpVT128 (Op VR128:$src2,
                                                VR128:$src1, VR128:$src3)))]>;
 
-  let mayLoad = MayLoad in
+  let mayLoad = 1 in
   def m     : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, f128mem:$src3),
                    !strconcat(OpcodeStr,
@@ -35,6 +36,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
                    [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
                                                (MemFrag128 addr:$src3))))]>;
 
+  let isCommutable = 1 in
   def rY    : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, VR256:$src3),
                    !strconcat(OpcodeStr,
@@ -42,7 +44,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
                    [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
                                                VR256:$src3)))]>;
 
-  let mayLoad = MayLoad in
+  let mayLoad = 1 in
   def mY    : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, f256mem:$src3),
                    !strconcat(OpcodeStr,
@@ -59,7 +61,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                        SDNode Op, ValueType OpTy128, ValueType OpTy256> {
   defm r213 : fma3p_rm<opc213,
                        !strconcat(OpcodeStr, !strconcat("213", PackTy)),
-                       MemFrag128, MemFrag256, OpTy128, OpTy256, Op, 0>;
+                       MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
 let neverHasSideEffects = 1 in {
   defm r132 : fma3p_rm<opc132,
                        !strconcat(OpcodeStr, !strconcat("132", PackTy)),
@@ -112,148 +114,18 @@ let ExeDomain = SSEPackedDouble in {
                                v4f64>, VEX_W;
 }
 
-let Predicates = [HasFMA] in {
-  def : Pat<(int_x86_fma_vfmadd_ps VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFMADDPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfmadd_ps VR128:$src2, VR128:$src1,
-             (memopv4f32 addr:$src3)),
-            (VFMADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmsub_ps VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFMSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfmsub_ps VR128:$src2, VR128:$src1,
-             (memopv4f32 addr:$src3)),
-            (VFMSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmaddsub_ps VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFMADDSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfmaddsub_ps VR128:$src2, VR128:$src1,
-             (memopv4f32 addr:$src3)),
-            (VFMADDSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmsubadd_ps VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFMSUBADDPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfmsubadd_ps VR128:$src2, VR128:$src1,
-             (memopv4f32 addr:$src3)),
-            (VFMSUBADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-
-  def : Pat<(int_x86_fma_vfmadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFMADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfmadd_ps_256 VR256:$src2, VR256:$src1,
-             (memopv8f32 addr:$src3)),
-            (VFMADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFMSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfmsub_ps_256 VR256:$src2, VR256:$src1,
-             (memopv8f32 addr:$src3)),
-            (VFMSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmaddsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFMADDSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfmaddsub_ps_256 VR256:$src2, VR256:$src1,
-             (memopv8f32 addr:$src3)),
-            (VFMADDSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmsubadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFMSUBADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfmsubadd_ps_256 VR256:$src2, VR256:$src1,
-             (memopv8f32 addr:$src3)),
-            (VFMSUBADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-
-  def : Pat<(int_x86_fma_vfmadd_pd VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFMADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfmadd_pd VR128:$src2, VR128:$src1,
-             (memopv2f64 addr:$src3)),
-            (VFMADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmsub_pd VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFMSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfmsub_pd VR128:$src2, VR128:$src1,
-             (memopv2f64 addr:$src3)),
-            (VFMSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmaddsub_pd VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFMADDSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfmaddsub_pd VR128:$src2, VR128:$src1,
-             (memopv2f64 addr:$src3)),
-            (VFMADDSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmsubadd_pd VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFMSUBADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfmsubadd_pd VR128:$src2, VR128:$src1,
-             (memopv2f64 addr:$src3)),
-            (VFMSUBADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-
-  def : Pat<(int_x86_fma_vfmadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFMADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfmadd_pd_256 VR256:$src2, VR256:$src1,
-             (memopv4f64 addr:$src3)),
-            (VFMADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFMSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfmsub_pd_256 VR256:$src2, VR256:$src1,
-             (memopv4f64 addr:$src3)),
-            (VFMSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmaddsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFMADDSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfmaddsub_pd_256 VR256:$src2, VR256:$src1,
-             (memopv4f64 addr:$src3)),
-            (VFMADDSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfmsubadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFMSUBADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfmsubadd_pd_256 VR256:$src2, VR256:$src1,
-             (memopv4f64 addr:$src3)),
-            (VFMSUBADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-
-  def : Pat<(int_x86_fma_vfnmadd_ps VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFNMADDPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfnmadd_ps VR128:$src2, VR128:$src1,
-             (memopv4f32 addr:$src3)),
-            (VFNMADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfnmsub_ps VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFNMSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfnmsub_ps VR128:$src2, VR128:$src1,
-             (memopv4f32 addr:$src3)),
-            (VFNMSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-
-  def : Pat<(int_x86_fma_vfnmadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFNMADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfnmadd_ps_256 VR256:$src2, VR256:$src1,
-             (memopv8f32 addr:$src3)),
-            (VFNMADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfnmsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFNMSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfnmsub_ps_256 VR256:$src2, VR256:$src1,
-             (memopv8f32 addr:$src3)),
-            (VFNMSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-
-  def : Pat<(int_x86_fma_vfnmadd_pd VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFNMADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfnmadd_pd VR128:$src2, VR128:$src1,
-             (memopv2f64 addr:$src3)),
-            (VFNMADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfnmsub_pd VR128:$src2, VR128:$src1, VR128:$src3),
-            (VFNMSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
-  def : Pat<(int_x86_fma_vfnmsub_pd VR128:$src2, VR128:$src1,
-             (memopv2f64 addr:$src3)),
-            (VFNMSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-
-  def : Pat<(int_x86_fma_vfnmadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFNMADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfnmadd_pd_256 VR256:$src2, VR256:$src1,
-             (memopv4f64 addr:$src3)),
-            (VFNMADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-  def : Pat<(int_x86_fma_vfnmsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
-            (VFNMSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
-  def : Pat<(int_x86_fma_vfnmsub_pd_256 VR256:$src2, VR256:$src1,
-             (memopv4f64 addr:$src3)),
-            (VFNMSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-
-} // Predicates = [HasFMA]
-
 let Constraints = "$src1 = $dst" in {
 multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
                     RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
-                    SDPatternOperator OpNode = null_frag, bit MayLoad = 1> {
+                    SDPatternOperator OpNode = null_frag> {
+  let isCommutable = 1 in
   def r     : FMA3<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, RC:$src3),
                    !strconcat(OpcodeStr,
                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set RC:$dst,
                      (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
-  let mayLoad = MayLoad in
+  let mayLoad = 1 in
   def m     : FMA3<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, x86memop:$src3),
                    !strconcat(OpcodeStr,
@@ -266,6 +138,7 @@ multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
 multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop,
                         ComplexPattern mem_cpat, Intrinsic IntId,
                         RegisterClass RC> {
+  let isCommutable = 1 in
   def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, VR128:$src3),
                    !strconcat(OpcodeStr,
@@ -294,7 +167,7 @@ let neverHasSideEffects = 1 in {
 }
 
 defm r213 : fma3s_rm<opc213, !strconcat(OpStr, !strconcat("213", PackTy)),
-                     x86memop, RC, OpVT, mem_frag, OpNode, 0>,
+                     x86memop, RC, OpVT, mem_frag, OpNode>,
             fma3s_rm_int<opc213, !strconcat(OpStr, !strconcat("213", PackTy)),
                          memop, mem_cpat, Int, RC>;
 }
@@ -324,73 +197,102 @@ defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
 //===----------------------------------------------------------------------===//
 
 
-multiclass fma4s<bits<8> opc, string OpcodeStr, Operand memop,
-                 ComplexPattern mem_cpat, Intrinsic Int> {
-  def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
-           (ins VR128:$src1, VR128:$src2, VR128:$src3),
+multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                 X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+                 PatFrag mem_frag> {
+  let isCommutable = 1 in
+  def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, RC:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR128:$dst,
-             (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
-  def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
-           (ins VR128:$src1, VR128:$src2, memop:$src3),
+           [(set RC:$dst,
+             (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4;
+  def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, x86memop:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR128:$dst,
-             (Int VR128:$src1, VR128:$src2, mem_cpat:$src3))]>, VEX_W, MemOp4;
-  def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
-           (ins VR128:$src1, memop:$src2, VR128:$src3),
+           [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+                           (mem_frag addr:$src3)))]>, VEX_W, MemOp4;
+  def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, x86memop:$src2, RC:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR128:$dst,
-             (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+           [(set RC:$dst,
+             (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>;
 // For disassembler
 let isCodeGenOnly = 1 in
-  def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
-               (ins VR128:$src1, VR128:$src2, VR128:$src3),
+  def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+               (ins RC:$src1, RC:$src2, RC:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
 }
 
-multiclass fma4p<bits<8> opc, string OpcodeStr,
-                 Intrinsic Int128, Intrinsic Int256,
+multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
+                     ComplexPattern mem_cpat, Intrinsic Int> {
+  let isCommutable = 1 in
+  def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst,
+                 (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+  def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, memop:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
+                                  mem_cpat:$src3))]>, VEX_W, MemOp4;
+  def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, memop:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst,
+                 (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+}
+
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                 ValueType OpVT128, ValueType OpVT256,
                  PatFrag ld_frag128, PatFrag ld_frag256> {
+  let isCommutable = 1 in
   def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src1, VR128:$src2, VR128:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR128:$dst,
-             (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+             (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+           VEX_W, MemOp4;
   def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, VR128:$src2, f128mem:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2,
+           [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
                               (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4;
   def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, f128mem:$src2, VR128:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR128:$dst,
-             (Int128 VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+             (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+  let isCommutable = 1 in
   def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
            (ins VR256:$src1, VR256:$src2, VR256:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR256:$dst,
-             (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>, VEX_W, MemOp4;
+             (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+           VEX_W, MemOp4;
   def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
            (ins VR256:$src1, VR256:$src2, f256mem:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2,
+           [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
                               (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4;
   def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
            (ins VR256:$src1, f256mem:$src2, VR256:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR256:$dst,
-             (Int256 VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>;
+             (OpNode VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>;
 // For disassembler
 let isCodeGenOnly = 1 in {
   def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
@@ -406,45 +308,58 @@ let isCodeGenOnly = 1 in {
 
 let Predicates = [HasFMA4] in {
 
-defm VFMADDSS4    : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32,
-                          int_x86_fma_vfmadd_ss>;
-defm VFMADDSD4    : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64,
-                          int_x86_fma_vfmadd_sd>;
-defm VFMADDPS4    : fma4p<0x68, "vfmaddps", int_x86_fma_vfmadd_ps,
-                          int_x86_fma_vfmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", int_x86_fma_vfmadd_pd,
-                          int_x86_fma_vfmadd_pd_256, memopv2f64, memopv4f64>;
-defm VFMSUBSS4    : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32,
-                          int_x86_fma_vfmsub_ss>;
-defm VFMSUBSD4    : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64,
-                          int_x86_fma_vfmsub_sd>;
-defm VFMSUBPS4    : fma4p<0x6C, "vfmsubps", int_x86_fma_vfmsub_ps,
-                          int_x86_fma_vfmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBPD4    : fma4p<0x6D, "vfmsubpd", int_x86_fma_vfmsub_pd,
-                          int_x86_fma_vfmsub_pd_256, memopv2f64, memopv4f64>;
-defm VFNMADDSS4   : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32,
-                          int_x86_fma_vfnmadd_ss>;
-defm VFNMADDSD4   : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
-                          int_x86_fma_vfnmadd_sd>;
-defm VFNMADDPS4   : fma4p<0x78, "vfnmaddps", int_x86_fma_vfnmadd_ps,
-                          int_x86_fma_vfnmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFNMADDPD4   : fma4p<0x79, "vfnmaddpd", int_x86_fma_vfnmadd_pd,
-                          int_x86_fma_vfnmadd_pd_256, memopv2f64, memopv4f64>;
-defm VFNMSUBSS4   : fma4s<0x7E, "vfnmsubss", ssmem, sse_load_f32,
-                          int_x86_fma_vfnmsub_ss>;
-defm VFNMSUBSD4   : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
-                          int_x86_fma_vfnmsub_sd>;
-defm VFNMSUBPS4   : fma4p<0x7C, "vfnmsubps", int_x86_fma_vfnmsub_ps,
-                          int_x86_fma_vfnmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFNMSUBPD4   : fma4p<0x7D, "vfnmsubpd", int_x86_fma_vfnmsub_pd,
-                          int_x86_fma_vfnmsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma_vfmaddsub_ps,
-                          int_x86_fma_vfmaddsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma_vfmaddsub_pd,
-                          int_x86_fma_vfmaddsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma_vfmsubadd_ps,
-                          int_x86_fma_vfmsubadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma_vfmsubadd_pd,
-                          int_x86_fma_vfmsubadd_pd_256, memopv2f64, memopv4f64>;
+defm VFMADDSS4  : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
+                  fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
+                            int_x86_fma_vfmadd_ss>;
+defm VFMADDSD4  : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
+                  fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfmadd_sd>;
+defm VFMSUBSS4  : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
+                  fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
+                            int_x86_fma_vfmsub_ss>;
+defm VFMSUBSD4  : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
+                  fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfmsub_sd>;
+defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+                        X86Fnmadd, loadf32>,
+                  fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+                            int_x86_fma_vfnmadd_ss>;
+defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+                        X86Fnmadd, loadf64>,
+                  fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfnmadd_sd>;
+defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+                        X86Fnmsub, loadf32>,
+                  fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+                            int_x86_fma_vfnmsub_ss>;
+defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+                        X86Fnmsub, loadf64>,
+                  fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfnmsub_sd>;
+
+defm VFMADDPS4    : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
+defm VFMSUBPS4    : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFMSUBPD4    : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
+defm VFNMADDPS4   : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFNMADDPD4   : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
+defm VFNMSUBPS4   : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFNMSUBPD4   : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
+defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
+defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
 } // HasFMA4
 
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 81b4f81..55ad2ec 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -287,12 +287,14 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
   let CodeSize = 3;
 }
 
+def __xs : XS;
+
 // SI - SSE 1 & 2 scalar instructions
 class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
          list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin> {
   let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
-            !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2]));
+            !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -303,7 +305,7 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin> {
   let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
-            !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2]));
+            !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -314,18 +316,25 @@ class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
          InstrItinClass itin, Domain d>
       : I<o, F, outs, ins, asm, pattern, itin, d> {
   let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
-        !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]));
+        !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
 }
 
+// MMXPI - SSE 1 & 2 packed instructions with MMX operands
+class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+            InstrItinClass itin, Domain d>
+      : I<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]);
+}
+
 // PIi8 - SSE 1 & 2 packed instructions with immediate
 class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin, Domain d>
       : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
   let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX],
-        !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]));
+        !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm);
@@ -341,18 +350,18 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
 
 class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
           list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>;
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
 class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>;
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
 class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
           list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
-        Requires<[HasSSE1]>;
+        Requires<[UseSSE1]>;
 class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
-        Requires<[HasSSE1]>;
+        Requires<[UseSSE1]>;
 class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
@@ -372,27 +381,31 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
 //   PDIi8  - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
 //   VSDI   - SSE2 instructions with XD prefix in AVX form.
 //   VPDI   - SSE2 instructions with TB and OpSize prefixes in AVX form.
+//   MMXSDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+//               MMX operands.
+//   MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+//               MMX operands.
 
 class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
           list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+      : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
 class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
 class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>;
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
 class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
 class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
           list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
-        Requires<[HasSSE2]>;
+        Requires<[UseSSE2]>;
 class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
-        Requires<[HasSSE2]>;
+        Requires<[UseSSE2]>;
 class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
@@ -405,6 +418,12 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
         OpSize, Requires<[HasAVX]>;
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+                list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
 
 // SSE3 Instruction Templates:
 // 
@@ -415,21 +434,23 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
 class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, 
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
-        Requires<[HasSSE3]>;
+        Requires<[UseSSE3]>;
 class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, 
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
-        Requires<[HasSSE3]>;
+        Requires<[UseSSE3]>;
 class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
           list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
-        Requires<[HasSSE3]>;
+        Requires<[UseSSE3]>;
 
 
 // SSSE3 Instruction Templates:
 // 
 //   SS38I - SSSE3 instructions with T8 prefix.
 //   SS3AI - SSSE3 instructions with TA prefix.
+//   MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+//   MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
 //
 // Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
 // uses the MMX registers. The 64-bit versions are grouped with the MMX
@@ -438,10 +459,18 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
 class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
-        Requires<[HasSSSE3]>;
+        Requires<[UseSSSE3]>;
 class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+        Requires<[HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
         Requires<[HasSSSE3]>;
 
 // SSE4.1 Instruction Templates:
@@ -452,11 +481,11 @@ class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
 class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
-        Requires<[HasSSE41]>;
+        Requires<[UseSSE41]>;
 class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
-        Requires<[HasSSE41]>;
+        Requires<[UseSSE41]>;
 
 // SSE4.2 Instruction Templates:
 // 
@@ -464,9 +493,10 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
 class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
-        Requires<[HasSSE42]>;
+        Requires<[UseSSE42]>;
 
 //   SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
 class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
@@ -475,7 +505,7 @@ class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
 class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
-        Requires<[HasSSE42]>;
+        Requires<[UseSSE42]>;
 
 // AVX Instruction Templates:
 //   Instructions introduced in AVX (no SSE equivalent forms)
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index ee2d3c4..9035435 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -183,8 +183,8 @@ def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFma>;
 def X86Fnmadd    : SDNode<"X86ISD::FNMADD",    SDTFma>;
 def X86Fmsub     : SDNode<"X86ISD::FMSUB",     SDTFma>;
 def X86Fnmsub    : SDNode<"X86ISD::FNMSUB",    SDTFma>;
-def X86Fmaddsub  : SDNode<"X86ISD::FMSUBADD",  SDTFma>;
-def X86Fmsubadd  : SDNode<"X86ISD::FMADDSUB",  SDTFma>;
+def X86Fmaddsub  : SDNode<"X86ISD::FMADDSUB",  SDTFma>;
+def X86Fmsubadd  : SDNode<"X86ISD::FMSUBADD",  SDTFma>;
 
 def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                          SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
@@ -240,6 +240,10 @@ def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
 def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
 def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
 
+// 128-/256-bit extload pattern fragments
+def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+
 // Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 459f01a..4f3d824 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1110,6 +1110,36 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VPUNPCKLWDYrr,     X86::VPUNPCKLWDYrm,      TB_ALIGN_32 },
     { X86::VPXORYrr,          X86::VPXORYrm,           TB_ALIGN_32 },
     // FIXME: add AVX 256-bit foldable instructions
+
+    // FMA4 foldable patterns
+    { X86::VFMADDSS4rr,       X86::VFMADDSS4mr,        TB_ALIGN_16 },
+    { X86::VFMADDSD4rr,       X86::VFMADDSD4mr,        TB_ALIGN_16 },
+    { X86::VFMADDPS4rr,       X86::VFMADDPS4mr,        TB_ALIGN_16 },
+    { X86::VFMADDPD4rr,       X86::VFMADDPD4mr,        TB_ALIGN_16 },
+    { X86::VFMADDPS4rrY,      X86::VFMADDPS4mrY,       TB_ALIGN_32 },
+    { X86::VFMADDPD4rrY,      X86::VFMADDPD4mrY,       TB_ALIGN_32 },
+    { X86::VFNMADDPS4rr,      X86::VFNMADDPS4mr,       TB_ALIGN_16 },
+    { X86::VFNMADDPD4rr,      X86::VFNMADDPD4mr,       TB_ALIGN_16 },
+    { X86::VFNMADDPS4rrY,     X86::VFNMADDPS4mrY,      TB_ALIGN_32 },
+    { X86::VFNMADDPD4rrY,     X86::VFNMADDPD4mrY,      TB_ALIGN_32 },
+    { X86::VFMSUBSS4rr,       X86::VFMSUBSS4mr,        TB_ALIGN_16 },
+    { X86::VFMSUBSD4rr,       X86::VFMSUBSD4mr,        TB_ALIGN_16 },
+    { X86::VFMSUBPS4rr,       X86::VFMSUBPS4mr,        TB_ALIGN_16 },
+    { X86::VFMSUBPD4rr,       X86::VFMSUBPD4mr,        TB_ALIGN_16 },
+    { X86::VFMSUBPS4rrY,      X86::VFMSUBPS4mrY,       TB_ALIGN_32 },
+    { X86::VFMSUBPD4rrY,      X86::VFMSUBPD4mrY,       TB_ALIGN_32 },
+    { X86::VFNMSUBPS4rr,      X86::VFNMSUBPS4mr,       TB_ALIGN_16 },
+    { X86::VFNMSUBPD4rr,      X86::VFNMSUBPD4mr,       TB_ALIGN_16 },
+    { X86::VFNMSUBPS4rrY,     X86::VFNMSUBPS4mrY,      TB_ALIGN_32 },
+    { X86::VFNMSUBPD4rrY,     X86::VFNMSUBPD4mrY,      TB_ALIGN_32 },
+    { X86::VFMADDSUBPS4rr,    X86::VFMADDSUBPS4mr,     TB_ALIGN_16 },
+    { X86::VFMADDSUBPD4rr,    X86::VFMADDSUBPD4mr,     TB_ALIGN_16 },
+    { X86::VFMADDSUBPS4rrY,   X86::VFMADDSUBPS4mrY,    TB_ALIGN_32 },
+    { X86::VFMADDSUBPD4rrY,   X86::VFMADDSUBPD4mrY,    TB_ALIGN_32 },
+    { X86::VFMSUBADDPS4rr,    X86::VFMSUBADDPS4mr,     TB_ALIGN_16 },
+    { X86::VFMSUBADDPD4rr,    X86::VFMSUBADDPD4mr,     TB_ALIGN_16 },
+    { X86::VFMSUBADDPS4rrY,   X86::VFMSUBADDPS4mrY,    TB_ALIGN_32 },
+    { X86::VFMSUBADDPD4rrY,   X86::VFMSUBADDPD4mrY,    TB_ALIGN_32 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
@@ -1237,6 +1267,36 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VFMSUBADDPDr132rY,     X86::VFMSUBADDPDr132mY,     TB_ALIGN_32 },
     { X86::VFMSUBADDPSr213rY,     X86::VFMSUBADDPSr213mY,     TB_ALIGN_32 },
     { X86::VFMSUBADDPDr213rY,     X86::VFMSUBADDPDr213mY,     TB_ALIGN_32 },
+
+    // FMA4 foldable patterns
+    { X86::VFMADDSS4rr,           X86::VFMADDSS4rm,           TB_ALIGN_16 },
+    { X86::VFMADDSD4rr,           X86::VFMADDSD4rm,           TB_ALIGN_16 },
+    { X86::VFMADDPS4rr,           X86::VFMADDPS4rm,           TB_ALIGN_16 },
+    { X86::VFMADDPD4rr,           X86::VFMADDPD4rm,           TB_ALIGN_16 },
+    { X86::VFMADDPS4rrY,          X86::VFMADDPS4rmY,          TB_ALIGN_32 },
+    { X86::VFMADDPD4rrY,          X86::VFMADDPD4rmY,          TB_ALIGN_32 },
+    { X86::VFNMADDPS4rr,          X86::VFNMADDPS4rm,          TB_ALIGN_16 },
+    { X86::VFNMADDPD4rr,          X86::VFNMADDPD4rm,          TB_ALIGN_16 },
+    { X86::VFNMADDPS4rrY,         X86::VFNMADDPS4rmY,         TB_ALIGN_32 },
+    { X86::VFNMADDPD4rrY,         X86::VFNMADDPD4rmY,         TB_ALIGN_32 },
+    { X86::VFMSUBSS4rr,           X86::VFMSUBSS4rm,           TB_ALIGN_16 },
+    { X86::VFMSUBSD4rr,           X86::VFMSUBSD4rm,           TB_ALIGN_16 },
+    { X86::VFMSUBPS4rr,           X86::VFMSUBPS4rm,           TB_ALIGN_16 },
+    { X86::VFMSUBPD4rr,           X86::VFMSUBPD4rm,           TB_ALIGN_16 },
+    { X86::VFMSUBPS4rrY,          X86::VFMSUBPS4rmY,          TB_ALIGN_32 },
+    { X86::VFMSUBPD4rrY,          X86::VFMSUBPD4rmY,          TB_ALIGN_32 },
+    { X86::VFNMSUBPS4rr,          X86::VFNMSUBPS4rm,          TB_ALIGN_16 },
+    { X86::VFNMSUBPD4rr,          X86::VFNMSUBPD4rm,          TB_ALIGN_16 },
+    { X86::VFNMSUBPS4rrY,         X86::VFNMSUBPS4rmY,         TB_ALIGN_32 },
+    { X86::VFNMSUBPD4rrY,         X86::VFNMSUBPD4rmY,         TB_ALIGN_32 },
+    { X86::VFMADDSUBPS4rr,        X86::VFMADDSUBPS4rm,        TB_ALIGN_16 },
+    { X86::VFMADDSUBPD4rr,        X86::VFMADDSUBPD4rm,        TB_ALIGN_16 },
+    { X86::VFMADDSUBPS4rrY,       X86::VFMADDSUBPS4rmY,       TB_ALIGN_32 },
+    { X86::VFMADDSUBPD4rrY,       X86::VFMADDSUBPD4rmY,       TB_ALIGN_32 },
+    { X86::VFMSUBADDPS4rr,        X86::VFMSUBADDPS4rm,        TB_ALIGN_16 },
+    { X86::VFMSUBADDPD4rr,        X86::VFMSUBADDPD4rm,        TB_ALIGN_16 },
+    { X86::VFMSUBADDPS4rrY,       X86::VFMSUBADDPS4rmY,       TB_ALIGN_32 },
+    { X86::VFMSUBADDPD4rrY,       X86::VFMSUBADDPD4rmY,       TB_ALIGN_32 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
@@ -1786,10 +1846,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   MachineInstr *MI = MBBI;
   MachineFunction &MF = *MI->getParent()->getParent();
   // All instructions input are two-addr instructions.  Get the known operands.
-  unsigned Dest = MI->getOperand(0).getReg();
-  unsigned Src = MI->getOperand(1).getReg();
-  bool isDead = MI->getOperand(0).isDead();
-  bool isKill = MI->getOperand(1).isKill();
+  const MachineOperand &Dest = MI->getOperand(0);
+  const MachineOperand &Src = MI->getOperand(1);
 
   MachineInstr *NewMI = NULL;
   // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
@@ -1807,11 +1865,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     unsigned B = MI->getOperand(1).getReg();
     unsigned C = MI->getOperand(2).getReg();
     if (B != C) return 0;
-    unsigned A = MI->getOperand(0).getReg();
     unsigned M = MI->getOperand(3).getImm();
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
-      .addReg(A, RegState::Define | getDeadRegState(isDead))
-      .addReg(B, getKillRegState(isKill)).addImm(M);
+      .addOperand(Dest).addOperand(Src).addImm(M);
     break;
   }
   case X86::SHUFPDrri: {
@@ -1821,15 +1877,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     unsigned B = MI->getOperand(1).getReg();
     unsigned C = MI->getOperand(2).getReg();
     if (B != C) return 0;
-    unsigned A = MI->getOperand(0).getReg();
     unsigned M = MI->getOperand(3).getImm();
 
     // Convert to PSHUFD mask.
     M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44;
 
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
-      .addReg(A, RegState::Define | getDeadRegState(isDead))
-      .addReg(B, getKillRegState(isKill)).addImm(M);
+      .addOperand(Dest).addOperand(Src).addImm(M);
     break;
   }
   case X86::SHL64ri: {
@@ -1840,15 +1894,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     if (ShAmt == 0 || ShAmt >= 4) return 0;
 
     // LEA can't handle RSP.
-    if (TargetRegisterInfo::isVirtualRegister(Src) &&
-        !MF.getRegInfo().constrainRegClass(Src, &X86::GR64_NOSPRegClass))
+    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+        !MF.getRegInfo().constrainRegClass(Src.getReg(),
+                                           &X86::GR64_NOSPRegClass))
       return 0;
 
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
-      .addReg(Dest, RegState::Define | getDeadRegState(isDead))
-      .addReg(0).addImm(1 << ShAmt)
-      .addReg(Src, getKillRegState(isKill))
-      .addImm(0).addReg(0);
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
     break;
   }
   case X86::SHL32ri: {
@@ -1859,15 +1912,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     if (ShAmt == 0 || ShAmt >= 4) return 0;
 
     // LEA can't handle ESP.
-    if (TargetRegisterInfo::isVirtualRegister(Src) &&
-        !MF.getRegInfo().constrainRegClass(Src, &X86::GR32_NOSPRegClass))
+    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+        !MF.getRegInfo().constrainRegClass(Src.getReg(),
+                                           &X86::GR32_NOSPRegClass))
       return 0;
 
     unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
-      .addReg(Dest, RegState::Define | getDeadRegState(isDead))
-      .addReg(0).addImm(1 << ShAmt)
-      .addReg(Src, getKillRegState(isKill)).addImm(0).addReg(0);
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
     break;
   }
   case X86::SHL16ri: {
@@ -1880,10 +1933,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     if (DisableLEA16)
       return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-      .addReg(Dest, RegState::Define | getDeadRegState(isDead))
-      .addReg(0).addImm(1 << ShAmt)
-      .addReg(Src, getKillRegState(isKill))
-      .addImm(0).addReg(0);
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
     break;
   }
   default: {
@@ -1906,14 +1957,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
         (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
 
       // LEA can't handle RSP.
-      if (TargetRegisterInfo::isVirtualRegister(Src) &&
-          !MF.getRegInfo().constrainRegClass(Src, RC))
+      if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+          !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
         return 0;
 
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                              .addReg(Dest, RegState::Define |
-                                      getDeadRegState(isDead)),
-                              Src, isKill, 1);
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                        .addOperand(Dest).addOperand(Src), 1);
       break;
     }
     case X86::INC16r:
@@ -1921,10 +1970,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       if (DisableLEA16)
         return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
       assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                           .addReg(Dest, RegState::Define |
-                                   getDeadRegState(isDead)),
-                           Src, isKill, 1);
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src), 1);
       break;
     case X86::DEC64r:
     case X86::DEC32r:
@@ -1936,14 +1983,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
         (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
         (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
       // LEA can't handle RSP.
-      if (TargetRegisterInfo::isVirtualRegister(Src) &&
-          !MF.getRegInfo().constrainRegClass(Src, RC))
+      if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+          !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
         return 0;
 
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                              .addReg(Dest, RegState::Define |
-                                      getDeadRegState(isDead)),
-                              Src, isKill, -1);
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                        .addOperand(Dest).addOperand(Src), -1);
       break;
     }
     case X86::DEC16r:
@@ -1951,10 +1996,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       if (DisableLEA16)
         return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
       assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                           .addReg(Dest, RegState::Define |
-                                   getDeadRegState(isDead)),
-                           Src, isKill, -1);
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src), -1);
       break;
     case X86::ADD64rr:
     case X86::ADD64rr_DB:
@@ -1981,9 +2024,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
         return 0;
 
       NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                        .addReg(Dest, RegState::Define |
-                                getDeadRegState(isDead)),
-                        Src, isKill, Src2, isKill2);
+                        .addOperand(Dest),
+                        Src.getReg(), Src.isKill(), Src2, isKill2);
 
       // Preserve undefness of the operands.
       bool isUndef = MI->getOperand(1).isUndef();
@@ -2003,9 +2045,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       unsigned Src2 = MI->getOperand(2).getReg();
       bool isKill2 = MI->getOperand(2).isKill();
       NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                        .addReg(Dest, RegState::Define |
-                                getDeadRegState(isDead)),
-                        Src, isKill, Src2, isKill2);
+                        .addOperand(Dest),
+                        Src.getReg(), Src.isKill(), Src2, isKill2);
+
+      // Preserve undefness of the operands.
+      bool isUndef = MI->getOperand(1).isUndef();
+      bool isUndef2 = MI->getOperand(2).isUndef();
+      NewMI->getOperand(1).setIsUndef(isUndef);
+      NewMI->getOperand(3).setIsUndef(isUndef2);
+
       if (LV && isKill2)
         LV->replaceKillInstruction(Src2, MI, NewMI);
       break;
@@ -2015,10 +2063,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     case X86::ADD64ri32_DB:
     case X86::ADD64ri8_DB:
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
-                              .addReg(Dest, RegState::Define |
-                                      getDeadRegState(isDead)),
-                              Src, isKill, MI->getOperand(2).getImm());
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+                        .addOperand(Dest).addOperand(Src),
+                        MI->getOperand(2).getImm());
       break;
     case X86::ADD32ri:
     case X86::ADD32ri8:
@@ -2026,10 +2073,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     case X86::ADD32ri8_DB: {
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
       unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                              .addReg(Dest, RegState::Define |
-                                      getDeadRegState(isDead)),
-                                Src, isKill, MI->getOperand(2).getImm());
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                        .addOperand(Dest).addOperand(Src),
+                        MI->getOperand(2).getImm());
       break;
     }
     case X86::ADD16ri:
@@ -2039,10 +2085,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       if (DisableLEA16)
         return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                              .addReg(Dest, RegState::Define |
-                                      getDeadRegState(isDead)),
-                              Src, isKill, MI->getOperand(2).getImm());
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src),
+                        MI->getOperand(2).getImm());
       break;
     }
   }
@@ -2051,10 +2096,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   if (!NewMI) return 0;
 
   if (LV) {  // Update live variables
-    if (isKill)
-      LV->replaceKillInstruction(Src, MI, NewMI);
-    if (isDead)
-      LV->replaceKillInstruction(Dest, MI, NewMI);
+    if (Src.isKill())
+      LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
+    if (Dest.isDead())
+      LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
   }
 
   MFI->insert(MBBI, NewMI);          // Insert the new inst
@@ -3444,6 +3489,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   case X86::FsFLD0SS:
   case X86::FsFLD0SD:
     return Expand2AddrUndef(MI, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
+  case X86::AVX_SET0:
+    assert(HasAVX && "AVX not supported");
+    return Expand2AddrUndef(MI, get(X86::VXORPSYrr));
+  case X86::V_SETALLONES:
+    return Expand2AddrUndef(MI, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+  case X86::AVX2_SETALLONES:
+    return Expand2AddrUndef(MI, get(X86::VPCMPEQDYrr));
   case X86::TEST8ri_NOREX:
     MI->setDesc(get(X86::TEST8ri));
     return true;
@@ -3557,14 +3609,16 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     OpcodeTablePtr = &RegOp2MemOpTable2Addr;
     isTwoAddrFold = true;
   } else if (i == 0) { // If operand 0
-    if (MI->getOpcode() == X86::MOV64r0)
-      NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI);
-    else if (MI->getOpcode() == X86::MOV32r0)
-      NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
-    else if (MI->getOpcode() == X86::MOV16r0)
-      NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI);
-    else if (MI->getOpcode() == X86::MOV8r0)
-      NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI);
+    unsigned Opc = 0;
+    switch (MI->getOpcode()) {
+    default: break;
+    case X86::MOV64r0: Opc = X86::MOV64mi32; break;
+    case X86::MOV32r0: Opc = X86::MOV32mi;   break;
+    case X86::MOV16r0: Opc = X86::MOV16mi;   break;
+    case X86::MOV8r0:  Opc = X86::MOV8mi;    break;
+    }
+    if (Opc)
+       NewMI = MakeM0Inst(*this, Opc, MOs, MI);
     if (NewMI)
       return NewMI;
 
@@ -3793,15 +3847,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     Alignment = (*LoadMI->memoperands_begin())->getAlignment();
   else
     switch (LoadMI->getOpcode()) {
-    case X86::AVX_SET0PSY:
-    case X86::AVX_SET0PDY:
     case X86::AVX2_SETALLONES:
-    case X86::AVX2_SET0:
+    case X86::AVX_SET0:
       Alignment = 32;
       break;
     case X86::V_SET0:
     case X86::V_SETALLONES:
-    case X86::AVX_SETALLONES:
       Alignment = 16;
       break;
     case X86::FsFLD0SD:
@@ -3837,11 +3888,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   switch (LoadMI->getOpcode()) {
   case X86::V_SET0:
   case X86::V_SETALLONES:
-  case X86::AVX_SET0PSY:
-  case X86::AVX_SET0PDY:
-  case X86::AVX_SETALLONES:
   case X86::AVX2_SETALLONES:
-  case X86::AVX2_SET0:
+  case X86::AVX_SET0:
   case X86::FsFLD0SD:
   case X86::FsFLD0SS: {
     // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
@@ -3873,15 +3921,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       Ty = Type::getFloatTy(MF.getFunction()->getContext());
     else if (Opc == X86::FsFLD0SD)
       Ty = Type::getDoubleTy(MF.getFunction()->getContext());
-    else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY)
-      Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8);
-    else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX2_SET0)
+    else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0)
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
     else
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
 
-    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES ||
-                      Opc == X86::AVX2_SETALLONES);
+    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES);
     const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
                                     Constant::getNullValue(Ty);
     unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
@@ -3956,6 +4001,8 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
     OpcodeTablePtr = &RegOp2MemOpTable1;
   } else if (OpNum == 2) {
     OpcodeTablePtr = &RegOp2MemOpTable2;
+  } else if (OpNum == 3) {
+    OpcodeTablePtr = &RegOp2MemOpTable3;
   }
 
   if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index d293156..304676d 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -114,7 +114,7 @@ def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
 def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
 
 def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
-                            [SDNPHasChain]>;
+                            [SDNPHasChain,SDNPSideEffect]>;
 def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
                         [SDNPHasChain]>;
 def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER,
@@ -552,14 +552,21 @@ def HasMMX       : Predicate<"Subtarget->hasMMX()">;
 def Has3DNow     : Predicate<"Subtarget->has3DNow()">;
 def Has3DNowA    : Predicate<"Subtarget->has3DNowA()">;
 def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
+def UseSSE1      : Predicate<"Subtarget->hasSSE1() && Subtarget->hasNoAVX()">;
 def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
+def UseSSE2      : Predicate<"Subtarget->hasSSE2() && Subtarget->hasNoAVX()">;
 def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
+def UseSSE3      : Predicate<"Subtarget->hasSSE3() && Subtarget->hasNoAVX()">;
 def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
+def UseSSSE3     : Predicate<"Subtarget->hasSSSE3() && Subtarget->hasNoAVX()">;
 def HasSSE41     : Predicate<"Subtarget->hasSSE41()">;
+def UseSSE41     : Predicate<"Subtarget->hasSSE41() && Subtarget->hasNoAVX()">;
 def HasSSE42     : Predicate<"Subtarget->hasSSE42()">;
+def UseSSE42     : Predicate<"Subtarget->hasSSE42() && Subtarget->hasNoAVX()">;
 def HasSSE4A     : Predicate<"Subtarget->hasSSE4A()">;
 def HasAVX       : Predicate<"Subtarget->hasAVX()">;
 def HasAVX2      : Predicate<"Subtarget->hasAVX2()">;
+def HasAVX1Only  : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
 
 def HasPOPCNT    : Predicate<"Subtarget->hasPOPCNT()">;
 def HasAES       : Predicate<"Subtarget->hasAES()">;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index c8f40bb..bd54858 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -118,11 +118,11 @@ let Constraints = "$src1 = $dst" in {
 /// Unary MMX instructions requiring SSSE3.
 multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
                                Intrinsic IntId64, OpndItins itins> {
-  def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+  def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>;
 
-  def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+  def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR64:$dst,
                      (IntId64 (bitconvert (memopmmx addr:$src))))],
@@ -134,11 +134,11 @@ let ImmT = NoImm, Constraints = "$src1 = $dst" in {
 multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId64, OpndItins itins> {
   let isCommutable = 0 in
-  def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+  def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
        (ins VR64:$src1, VR64:$src2),
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>;
-  def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+  def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
        (ins VR64:$src1, i64mem:$src2),
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        [(set VR64:$dst,
@@ -149,11 +149,11 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
 
 /// PALIGN MMX instructions (require SSSE3).
 multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
-  def R64irr  : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+  def R64irr  : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
       (ins VR64:$src1, VR64:$src2, i8imm:$src3),
       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 
       [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>;
-  def R64irm  : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+  def R64irm  : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
       (ins VR64:$src1, i64mem:$src2, i8imm:$src3),
       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
       [(set VR64:$dst, (IntId VR64:$src1,
@@ -163,12 +163,10 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
 multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
                          string asm, OpndItins itins, Domain d> {
-  def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
-                        [(set DstRC:$dst, (Int SrcRC:$src))], 
-                        itins.rr, d>;
-  def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
-                        [(set DstRC:$dst, (Int (ld_frag addr:$src)))], 
-                        itins.rm, d>;
+  def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+                  [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>;
+  def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+                  [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>;
 }
 
 multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
@@ -243,29 +241,30 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                         [(store (x86mmx VR64:$src), addr:$dst)],
                         IIC_MMX_MOVQ_RM>;
 
-def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
-                          (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
-                          [(set VR64:$dst,
-                            (x86mmx (bitconvert
-                            (i64 (vector_extract (v2i64 VR128:$src),
-                                  (iPTR 0))))))],
-                          IIC_MMX_MOVQ_RR>;
-
-def MMX_MOVQ2DQrr : S2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
-                            (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
-          [(set VR128:$dst,
-            (v2i64 (scalar_to_vector
-                              (i64 (bitconvert (x86mmx VR64:$src))))))],
-                           IIC_MMX_MOVQ_RR>;
+def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+                             (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                               (x86mmx (bitconvert
+                               (i64 (vector_extract (v2i64 VR128:$src),
+                                     (iPTR 0))))))],
+                             IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+                              (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+                              [(set VR128:$dst,
+                                (v2i64
+                                  (scalar_to_vector
+                                    (i64 (bitconvert (x86mmx VR64:$src))))))],
+                              IIC_MMX_MOVQ_RR>;
 
 let neverHasSideEffects = 1 in
-def MMX_MOVQ2FR64rr: S2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
-                       (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [],
-                       IIC_MMX_MOVQ_RR>;
+def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+                               (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+                               [], IIC_MMX_MOVQ_RR>;
 
-def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
-                       (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [],
-                       IIC_MMX_MOVQ_RR>;
+def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+                              (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+                              [], IIC_MMX_MOVQ_RR>;
 
 def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                          "movntq\t{$src, $dst|$dst, $src}",
@@ -577,6 +576,7 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
                            IIC_MMX_MASKMOV>;
 
 // 64-bit bit convert.
+let Predicates = [HasSSE2] in {
 def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
           (MMX_MOVD64to64rr GR64:$src)>;
 def : Pat<(i64 (bitconvert (x86mmx VR64:$src))),
@@ -585,5 +585,6 @@ def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
           (MMX_MOVQ2FR64rr VR64:$src)>;
 def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
           (MMX_MOVFR642Qrr FR64:$src)>;
+}
 
 
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 220c06d..17e91a6 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -251,35 +251,37 @@ def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
 
 // A 128-bit subvector extract from the first 256-bit vector position
 // is a subregister copy that needs no instruction.
-def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))),
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
           (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
-def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))),
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
           (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
 
-def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))),
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
           (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
-def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))),
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
           (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
 
-def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))),
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
           (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
-def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))),
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
           (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
 
 // A 128-bit subvector insert to the first 256-bit vector position
 // is a subregister copy that needs no instruction.
-def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)),
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+}
 
 // Implicitly promote a 32-bit scalar to a vector.
 def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
@@ -362,7 +364,7 @@ let Predicates = [HasAVX] in {
   def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
 }
 
-// Alias instructions that map fld0 to pxor for sse.
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
 // This is expanded by ExpandPostRAPseudos.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     isPseudo = 1 in {
@@ -382,11 +384,11 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
 // We set canFoldAsLoad because this can be converted to a constant-pool
 // load of an all-zeros value if folding it would be beneficial.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isPseudo = 1, neverHasSideEffects = 1 in {
-def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>;
+    isPseudo = 1 in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
 }
 
-def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
 def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
 def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
 def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
@@ -394,35 +396,29 @@ def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
 def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
 
 
-// The same as done above but for AVX.  The 256-bit ISA does not support PI,
+// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
 // and doesn't need it because on sandy bridge the register is set to zero
 // at the rename stage without using any execution unit, so SET0PSY
 // and SET0PDY can be used for vector int instructions without penalty
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementatioan, it does not expand the instructions below like
-// X86MCInstLower does.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1 in {
-let Predicates = [HasAVX] in {
-def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
-                   [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
-def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
-                   [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
-}
-let Predicates = [HasAVX2], neverHasSideEffects = 1 in
-def AVX2_SET0   : PDI<0xef, MRMInitReg, (outs VR256:$dst), (ins), "",
-                   []>, VEX_4V;
+    isPseudo = 1, Predicates = [HasAVX] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
 }
 
-let Predicates = [HasAVX2], AddedComplexity = 5 in {
-  def : Pat<(v4i64 immAllZerosV), (AVX2_SET0)>;
-  def : Pat<(v8i32 immAllZerosV), (AVX2_SET0)>;
-  def : Pat<(v16i16 immAllZerosV), (AVX2_SET0)>;
-  def : Pat<(v32i8 immAllZerosV), (AVX2_SET0)>;
+let Predicates = [HasAVX] in
+  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
 }
 
-// AVX has no support for 256-bit integer instructions, but since the 128-bit
+// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
 // VPXOR instruction writes zero to its upper part, it's safe build zeros.
+let Predicates = [HasAVX1Only] in {
 def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
 def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
           (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
@@ -438,22 +434,17 @@ def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
 def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
 def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
           (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+}
 
 // We set canFoldAsLoad because this can be converted to a constant-pool
 // load of an all-ones value if folding it would be beneficial.
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementation, it does not expand the instructions below like
-// X86MCInstLower does.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
-  let Predicates = [HasAVX] in
-  def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
-                         [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
-  def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
-                         [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+    isPseudo = 1 in {
+  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
   let Predicates = [HasAVX2] in
-  def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "",
-                          [(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V;
+  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
 }
 
 
@@ -605,27 +596,27 @@ let Predicates = [HasAVX] in {
   // Represent the same patterns above but in the form they appear for
   // 256-bit types
   def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (i32 0)))),
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))),
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))),
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
   }
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector FR32:$src)), (i32 0)))),
+                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0),
                            (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
                            sub_xmm)>;
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector FR64:$src)), (i32 0)))),
+                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
             (SUBREG_TO_REG (i64 0),
                            (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                            sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (i32 0)))),
+                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
 
   // Move low f64 and clear high bits.
@@ -704,7 +695,7 @@ let Predicates = [HasAVX] in {
             (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   let AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSS to the lower bits.
@@ -738,7 +729,7 @@ let Predicates = [HasSSE1] in {
             (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   let AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSD to the lower bits.
@@ -916,16 +907,16 @@ let isCodeGenOnly = 1 in {
 
 let Predicates = [HasAVX] in {
 def : Pat<(v8i32 (X86vzmovl
-                        (insert_subvector undef, (v4i32 VR128:$src), (i32 0)))),
+                  (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
           (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
 def : Pat<(v4i64 (X86vzmovl
-                        (insert_subvector undef, (v2i64 VR128:$src), (i32 0)))),
+                  (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
           (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
 def : Pat<(v8f32 (X86vzmovl
-                        (insert_subvector undef, (v4f32 VR128:$src), (i32 0)))),
+                  (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
           (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
 def : Pat<(v4f64 (X86vzmovl
-                        (insert_subvector undef, (v2f64 VR128:$src), (i32 0)))),
+                  (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
           (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
 }
 
@@ -975,10 +966,10 @@ let Predicates = [HasAVX] in {
             (VMOVUPDmr addr:$dst, VR128:$src)>;
 }
 
-let Predicates = [HasSSE1] in
+let Predicates = [UseSSE1] in
   def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
             (MOVUPSmr addr:$dst, VR128:$src)>;
-let Predicates = [HasSSE2] in
+let Predicates = [UseSSE2] in
   def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
             (MOVUPDmr addr:$dst, VR128:$src)>;
 
@@ -1028,12 +1019,52 @@ let Predicates = [HasAVX] in {
             (VMOVUPSYmr addr:$dst, VR256:$src)>;
   def : Pat<(store (v32i8 VR256:$src), addr:$dst),
             (VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+  // Special patterns for storing subvector extracts of lower 128-bits
+  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+  def : Pat<(alignedstore (v2f64 (extract_subvector
+                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4f32 (extract_subvector
+                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v2i64 (extract_subvector
+                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4i32 (extract_subvector
+                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v8i16 (extract_subvector
+                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v16i8 (extract_subvector
+                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+  def : Pat<(store (v2f64 (extract_subvector
+                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4f32 (extract_subvector
+                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v2i64 (extract_subvector
+                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4i32 (extract_subvector
+                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v8i16 (extract_subvector
+                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v16i8 (extract_subvector
+                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
 }
 
 // Use movaps / movups for SSE integer load / store (one byte shorter).
 // The instructions selected below are then converted to MOVDQA/MOVDQU
 // during the SSE domain pass.
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   def : Pat<(alignedloadv2i64 addr:$src),
             (MOVAPSrm addr:$src)>;
   def : Pat<(loadv2i64 addr:$src),
@@ -1180,7 +1211,7 @@ let Predicates = [HasAVX] in {
             (VMOVLPDmr addr:$src1, VR128:$src2)>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
   def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
                                  (iPTR 0))), addr:$src1),
@@ -1205,7 +1236,7 @@ let Predicates = [HasSSE1] in {
             (MOVLPSmr addr:$src1, VR128:$src2)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   // Shuffle with MOVLPD
   def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
             (MOVLPDrm VR128:$src1, addr:$src2)>;
@@ -1279,7 +1310,7 @@ let Predicates = [HasAVX] in {
             (VMOVHPDrm VR128:$src1, addr:$src2)>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   // MOVHPS patterns
   def : Pat<(X86Movlhps VR128:$src1,
                  (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
@@ -1289,7 +1320,7 @@ let Predicates = [HasSSE1] in {
             (MOVHPSrm VR128:$src1, addr:$src2)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
   // is during lowering, where it's not possible to recognize the load fold 
   // cause it has two uses through a bitcast. One use disappears at isel time
@@ -1346,7 +1377,7 @@ let Predicates = [HasAVX] in {
             (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   // MOVLHPS patterns
   def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
             (MOVLHPSrr VR128:$src1, VR128:$src2)>;
@@ -1456,7 +1487,7 @@ def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}",
 def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}",
                 (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
 
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX] in {
   def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
             (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
   def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -1633,7 +1664,7 @@ defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
 defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                             "cvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, SSE_CVT_PS>,
-                            TB, Requires<[HasSSE2]>;
+                            TB, Requires<[UseSSE2]>;
 
 /// SSE 2 Only
 
@@ -1663,7 +1694,7 @@ def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                       [(set FR32:$dst, (fround (loadf64 addr:$src)))],
                       IIC_SSE_CVT_Scalar_RM>,
                       XD,
-                  Requires<[HasSSE2, OptForSize]>;
+                  Requires<[UseSSE2, OptForSize]>;
 
 def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -1684,13 +1715,13 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
-                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[HasSSE2]>;
+                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>;
 def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                           VR128:$src1, sse_load_f64:$src2))],
-                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[HasSSE2]>;
+                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>;
 }
 
 // Convert scalar single to scalar double
@@ -1709,30 +1740,28 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                     XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
 }
 
-let AddedComplexity = 1 in { // give AVX priority
-  def : Pat<(f64 (fextend FR32:$src)),
-            (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
-  def : Pat<(fextend (loadf32 addr:$src)),
-            (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
+def : Pat<(f64 (fextend FR32:$src)),
+    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
 
-  def : Pat<(extloadf32 addr:$src),
-            (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
-            Requires<[HasAVX, OptForSize]>;
-  def : Pat<(extloadf32 addr:$src),
-            (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
-            Requires<[HasAVX, OptForSpeed]>;
-} // AddedComplexity = 1
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+    Requires<[HasAVX, OptForSize]>;
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+    Requires<[HasAVX, OptForSpeed]>;
 
 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                    "cvtss2sd\t{$src, $dst|$dst, $src}",
                    [(set FR64:$dst, (fextend FR32:$src))],
                    IIC_SSE_CVT_Scalar_RR>, XS,
-                 Requires<[HasSSE2]>;
+                 Requires<[UseSSE2]>;
 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                    "cvtss2sd\t{$src, $dst|$dst, $src}",
                    [(set FR64:$dst, (extloadf32 addr:$src))],
                    IIC_SSE_CVT_Scalar_RM>, XS,
-                 Requires<[HasSSE2, OptForSize]>;
+                 Requires<[UseSSE2, OptForSize]>;
 
 // extload f32 -> f64.  This matches load+fextend because we have a hack in
 // the isel (PreprocessForFPConvert) that can introduce loads after dag
@@ -1740,9 +1769,9 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
 // Since these loads aren't folded into the fextend, we have to match it
 // explicitly here.
 def : Pat<(fextend (loadf32 addr:$src)),
-          (CVTSS2SDrm addr:$src)>, Requires<[HasSSE2]>;
+          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
 def : Pat<(extloadf32 addr:$src),
-          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>;
+          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
 
 def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -1762,13 +1791,13 @@ def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                     "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
-                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[HasSSE2]>;
+                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>;
 def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                     "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
-                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[HasSSE2]>;
+                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>;
 }
 
 // Convert packed single/double fp to doubleword
@@ -1904,7 +1933,7 @@ let Predicates = [HasAVX] in {
             (VCVTTPS2DQYrm addr:$src)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
             (CVTDQ2PSrr VR128:$src)>;
   def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
@@ -1978,10 +2007,10 @@ def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "vcvtps2pd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, TB, VEX;
-let neverHasSideEffects = 1, mayLoad = 1 in
 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
-                     "vcvtps2pd\t{$src, $dst|$dst, $src}", [],
-                     IIC_SSE_CVT_PD_RM>, TB, VEX;
+                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+                    IIC_SSE_CVT_PD_RM>, TB, VEX;
 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                      "vcvtps2pd\t{$src, $dst|$dst, $src}",
                      [(set VR256:$dst,
@@ -1994,15 +2023,15 @@ def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                      IIC_SSE_CVT_PD_RM>, TB, VEX;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtps2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                        IIC_SSE_CVT_PD_RR>, TB;
-let neverHasSideEffects = 1, mayLoad = 1 in
 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
-                       "cvtps2pd\t{$src, $dst|$dst, $src}", [],
-                       IIC_SSE_CVT_PD_RM>, TB;
+                   "cvtps2pd\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+                   IIC_SSE_CVT_PD_RM>, TB;
 }
 
 // Convert Packed DW Integers to Packed Double FP
@@ -2105,11 +2134,11 @@ let Predicates = [HasAVX] in {
             (VCVTPS2PDrr VR128:$src)>;
   def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
             (VCVTPS2PDYrr VR128:$src)>;
-  def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
+  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
             (VCVTPS2PDYrm addr:$src)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   // Match fextend for 128 conversions
   def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
             (CVTPS2PDrr VR128:$src)>;
@@ -2336,14 +2365,14 @@ def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
           (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
 def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
           (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
 def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
           (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
 def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
           (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
 def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
@@ -2420,7 +2449,7 @@ let Predicates = [HasAVX] in {
             (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   def : Pat<(v4i32 (X86Shufp VR128:$src1,
                        (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
             (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
@@ -2428,7 +2457,7 @@ let Predicates = [HasSSE1] in {
             (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   // Generic SHUFPD patterns
   def : Pat<(v2i64 (X86Shufp VR128:$src1,
                        (memopv2i64 addr:$src2), (i8 imm:$imm))),
@@ -2500,7 +2529,27 @@ let Constraints = "$src1 = $dst" in {
                        SSEPackedDouble>, TB, OpSize;
 } // Constraints = "$src1 = $dst"
 
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX1Only] in {
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
+            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
+            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+}
+
+let Predicates = [HasAVX] in {
   // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
   // problem is during lowering, where it's not possible to recognize the load
   // fold cause it has two uses through a bitcast. One use disappears at isel
@@ -2509,7 +2558,7 @@ let Predicates = [HasAVX], AddedComplexity = 1 in {
             (VUNPCKLPDrr VR128:$src, VR128:$src)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
   // problem is during lowering, where it's not possible to recognize the load
   // fold cause it has two uses through a bitcast. One use disappears at isel
@@ -2578,16 +2627,16 @@ defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
 
 def : Pat<(i32 (X86fgetsign FR32:$src)),
           (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>,
-      Requires<[HasSSE1]>;
+      Requires<[UseSSE1]>;
 def : Pat<(i64 (X86fgetsign FR32:$src)),
           (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>,
-      Requires<[HasSSE1]>;
+      Requires<[UseSSE1]>;
 def : Pat<(i32 (X86fgetsign FR64:$src)),
           (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>,
-      Requires<[HasSSE2]>;
+      Requires<[UseSSE2]>;
 def : Pat<(i64 (X86fgetsign FR64:$src)),
           (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>,
-      Requires<[HasSSE2]>;
+      Requires<[UseSSE2]>;
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Logical Instructions
@@ -2683,14 +2732,12 @@ multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
 }
 
 // Alias bitwise logical operations using SSE logical ops on packed FP values.
-let mayLoad = 0 in {
-  defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
-                SSE_BIT_ITINS_P>;
-  defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
-                SSE_BIT_ITINS_P>;
-  defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
-                SSE_BIT_ITINS_P>;
-}
+defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
+              SSE_BIT_ITINS_P>;
+defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
+              SSE_BIT_ITINS_P>;
+defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+              SSE_BIT_ITINS_P>;
 
 let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
   defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef,
@@ -2794,27 +2841,23 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                    SizeItins itins,
                                    bit Is2Addr = 1> {
-  let mayLoad = 0 in {
   defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
               v4f32, f128mem, memopv4f32, SSEPackedSingle, itins.s, Is2Addr>,
               TB;
   defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
               v2f64, f128mem, memopv2f64, SSEPackedDouble, itins.d, Is2Addr>,
               TB, OpSize;
-  }
 }
 
 multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr,
                                     SDNode OpNode,
                                     SizeItins itins> {
-  let mayLoad = 0 in {
-    defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
+  defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
                 v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>,
                 TB;
-    defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
+  defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
                 v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>,
                 TB, OpSize;
-  }
 }
 
 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
@@ -2924,7 +2967,7 @@ let Constraints = "$src1 = $dst" in {
   }
 }
 
-let isCommutable = 1, isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1 in {
   defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>,
        VEX_4V, VEX_LIG;
   defm VMAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P, 0>,
@@ -2978,7 +3021,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
   def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                 [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
-            Requires<[HasSSE1, OptForSize]>;
+            Requires<[UseSSE1, OptForSize]>;
   def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>;
@@ -2992,7 +3035,7 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
   def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
                 !strconcat(OpcodeStr,
                            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
-  let mayLoad = 1 in
+  let mayLoad = 1 in {
   def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2),
                 !strconcat(OpcodeStr,
                            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
@@ -3000,6 +3043,7 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
                 (ins VR128:$src1, ssmem:$src2),
                 !strconcat(OpcodeStr,
                            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+  }
 }
 
 /// sse1_fp_unop_p - SSE1 unops in packed form.
@@ -3062,7 +3106,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
   def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
                 !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                 [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
-            Requires<[HasSSE2, OptForSize]>;
+            Requires<[UseSSE2, OptForSize]>;
   def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>;
@@ -3072,20 +3116,20 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
 }
 
 /// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form.
+let hasSideEffects = 0 in
 multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
-  let neverHasSideEffects = 1 in {
   def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
                !strconcat(OpcodeStr,
                           "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
-  let mayLoad = 1 in
+  let mayLoad = 1 in {
   def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2),
                !strconcat(OpcodeStr,
                           "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
-  }
   def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, sdmem:$src2),
                !strconcat(OpcodeStr,
                           "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+  }
 }
 
 /// sse2_fp_unop_p - SSE2 unops in vector forms.
@@ -3176,7 +3220,6 @@ let Predicates = [HasAVX] in {
                                     SSE_RCPP>, VEX;
 }
 
-let AddedComplexity = 1 in {
 def : Pat<(f32 (fsqrt FR32:$src)),
           (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
 def : Pat<(f32 (fsqrt (load addr:$src))),
@@ -3199,9 +3242,8 @@ def : Pat<(f32 (X86frcp FR32:$src)),
 def : Pat<(f32 (X86frcp (load addr:$src))),
           (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
           Requires<[HasAVX, OptForSize]>;
-}
 
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX] in {
   def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
             (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
@@ -3322,7 +3364,7 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     IIC_SSE_MOVNT>;
 
 def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
-          (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+          (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
 
 // There is no AVX form for instructions below this point
 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
@@ -3482,7 +3524,7 @@ def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 
 def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqu\t{$src, $dst|$dst, $src}",
-                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>;
+                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
 
 // For Disassembler
 let isCodeGenOnly = 1 in {
@@ -3492,7 +3534,7 @@ def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
 
 def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqu\t{$src, $dst|$dst, $src}",
-                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>;
+                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
 }
 
 let canFoldAsLoad = 1, mayLoad = 1 in {
@@ -3504,7 +3546,7 @@ def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "movdqu\t{$src, $dst|$dst, $src}",
                    [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
                    IIC_SSE_MOVU_P_RM>,
-                 XS, Requires<[HasSSE2]>;
+                 XS, Requires<[UseSSE2]>;
 }
 
 let mayStore = 1 in {
@@ -3516,7 +3558,7 @@ def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movdqu\t{$src, $dst|$dst, $src}",
                    [/*(store (v2i64 VR128:$src), addr:$dst)*/],
                    IIC_SSE_MOVU_P_MR>,
-                 XS, Requires<[HasSSE2]>;
+                 XS, Requires<[UseSSE2]>;
 }
 
 // Intrinsic forms of MOVDQU load and store
@@ -3530,7 +3572,7 @@ def MOVDQUmr_Int :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                        "movdqu\t{$src, $dst|$dst, $src}",
                        [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)],
                        IIC_SSE_MOVU_P_MR>,
-                     XS, Requires<[HasSSE2]>;
+                     XS, Requires<[UseSSE2]>;
 
 } // ExeDomain = SSEPackedInt
 
@@ -4028,7 +4070,7 @@ let Predicates = [HasAVX2] in {
             (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
             (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
@@ -4210,7 +4252,7 @@ let Predicates = [HasAVX2] in {
   defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>, XD, VEX;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
  let AddedComplexity = 5 in
   defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, X86PShufd>, TB, OpSize;
 
@@ -4325,28 +4367,6 @@ let Constraints = "$src1 = $dst" in {
 }
 } // ExeDomain = SSEPackedInt
 
-// Patterns for using AVX1 instructions with integer vectors
-// Here to give AVX2 priority
-let Predicates = [HasAVX] in {
-  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
-            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
-            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
-            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
-            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
-
-  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
-            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
-            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
-            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
-            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
-}
-
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Extract and Insert
 //===---------------------------------------------------------------------===//
@@ -4395,7 +4415,7 @@ let Predicates = [HasAVX] in {
 }
 
 let Constraints = "$src1 = $dst" in
-  defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[HasSSE2]>;
+  defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>;
 
 } // ExeDomain = SSEPackedInt
 
@@ -4556,7 +4576,7 @@ def MOVPDI2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
 // Move Packed Doubleword Int first element to Doubleword Int
 //
 def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
-                          "mov{d|q}\t{$src, $dst|$dst, $src}",
+                          "vmov{d|q}\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                            (iPTR 0)))],
                                                            IIC_SSE_MOVD_ToGP>,
@@ -4672,14 +4692,14 @@ let Predicates = [HasAVX] in {
   }
   // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
   def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                                (v4i32 (scalar_to_vector GR32:$src)),(i32 0)))),
+                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                                (v2i64 (scalar_to_vector GR64:$src)),(i32 0)))),
+                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
 }
 
-let Predicates = [HasSSE2], AddedComplexity = 20 in {
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
   def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
             (MOVZDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
@@ -4719,7 +4739,7 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     [(set VR128:$dst,
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
                       IIC_SSE_MOVDQ>, XS,
-                    Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix
+                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
 
 //===---------------------------------------------------------------------===//
 // Move Packed Quadword Int to Quadword Int
@@ -4762,7 +4782,7 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                  (loadi64 addr:$src))))))],
                                                  IIC_SSE_MOVDQ>,
-                     XS, Requires<[HasSSE2]>;
+                     XS, Requires<[UseSSE2]>;
 
 let Predicates = [HasAVX], AddedComplexity = 20 in {
   def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
@@ -4773,7 +4793,7 @@ let Predicates = [HasAVX], AddedComplexity = 20 in {
             (VMOVZQI2PQIrm addr:$src)>;
 }
 
-let Predicates = [HasSSE2], AddedComplexity = 20 in {
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
   def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
             (MOVZQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
@@ -4803,7 +4823,7 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                     IIC_SSE_MOVQ_RR>,
-                      XS, Requires<[HasSSE2]>;
+                      XS, Requires<[UseSSE2]>;
 
 let AddedComplexity = 20 in
 def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
@@ -4818,7 +4838,7 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                     [(set VR128:$dst, (v2i64 (X86vzmovl
                                              (loadv2i64 addr:$src))))],
                                              IIC_SSE_MOVDQ>,
-                      XS, Requires<[HasSSE2]>;
+                      XS, Requires<[UseSSE2]>;
 }
 
 let AddedComplexity = 20 in {
@@ -4828,7 +4848,7 @@ let AddedComplexity = 20 in {
     def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
               (VMOVZPQILo2PQIrr VR128:$src)>;
   }
-  let Predicates = [HasSSE2] in {
+  let Predicates = [UseSSE2] in {
     def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
               (MOVZPQILo2PQIrm addr:$src)>;
     def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
@@ -4908,7 +4928,7 @@ let Predicates = [HasAVX] in {
             (VMOVSLDUPYrm addr:$src)>;
 }
 
-let Predicates = [HasSSE3] in {
+let Predicates = [UseSSE3] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (MOVSHDUPrr VR128:$src)>;
   def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
@@ -4977,7 +4997,7 @@ let Predicates = [HasAVX] in {
             (VMOVDDUPYrr VR256:$src)>;
 }
 
-let Predicates = [HasSSE3] in {
+let Predicates = [UseSSE3] in {
   def : Pat<(X86Movddup (memopv2f64 addr:$src)),
             (MOVDDUPrm addr:$src)>;
   def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
@@ -5041,7 +5061,7 @@ let Predicates = [HasAVX] in {
                                  f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
   }
 }
-let Constraints = "$src1 = $dst", Predicates = [HasSSE3] in {
+let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
   let ExeDomain = SSEPackedSingle in
   defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                               f128mem, SSE_ALU_F32P>, TB, XD;
@@ -5424,7 +5444,7 @@ let Predicates = [HasAVX] in
   defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
 let Predicates = [HasAVX2] in
   defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V;
-let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in
+let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
   defm PALIGN : ssse3_palign<"palignr">;
 
 let Predicates = [HasAVX2] in {
@@ -5449,7 +5469,7 @@ def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
           (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
 }
 
-let Predicates = [HasSSSE3] in {
+let Predicates = [UseSSSE3] in {
 def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
           (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
 def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
@@ -5583,7 +5603,7 @@ let Predicates = [HasAVX] in {
             (VPMOVZXDQrm addr:$src)>;
 }
 
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
   // Common patterns involving scalar load.
   def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
             (PMOVSXBWrm addr:$src)>;
@@ -5633,7 +5653,7 @@ let Predicates = [HasAVX] in {
   def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
 }
 
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
   def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
   def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
 }
@@ -5704,7 +5724,7 @@ let Predicates = [HasAVX] in {
             (VPMOVZXWQrm addr:$src)>;
 }
 
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
   // Common patterns involving scalar load
   def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
             (PMOVSXBDrm addr:$src)>;
@@ -5772,7 +5792,7 @@ let Predicates = [HasAVX] in {
             (VPMOVZXBQrm addr:$src)>;
 }
 
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
   // Common patterns involving scalar load
   def : Pat<(int_x86_sse41_pmovsxbq
               (bitconvert (v4i32 (X86vzmovl
@@ -5918,7 +5938,7 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                               imm:$src2))),
                  addr:$dst),
           (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
-          Requires<[HasSSE41]>;
+          Requires<[UseSSE41]>;
 
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Insert Instructions
@@ -6190,6 +6210,15 @@ let Predicates = [HasAVX] in {
             (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
   def : Pat<(f64 (ftrunc FR64:$src)),
             (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+  def : Pat<(v4f32 (ffloor VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x1))>;
+  def : Pat<(v2f64 (ffloor VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x1))>;
+  def : Pat<(v8f32 (ffloor VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x1))>;
+  def : Pat<(v4f64 (ffloor VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x1))>;
 }
 
 defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
@@ -6199,26 +6228,33 @@ let Constraints = "$src1 = $dst" in
 defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
                                int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
 
-def : Pat<(ffloor FR32:$src),
-          (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
-def : Pat<(f64 (ffloor FR64:$src)),
-          (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
-def : Pat<(f32 (fnearbyint FR32:$src)),
-          (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
-def : Pat<(f64 (fnearbyint FR64:$src)),
-          (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
-def : Pat<(f32 (fceil FR32:$src)),
-          (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
-def : Pat<(f64 (fceil FR64:$src)),
-          (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
-def : Pat<(f32 (frint FR32:$src)),
-          (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
-def : Pat<(f64 (frint FR64:$src)),
-          (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
-def : Pat<(f32 (ftrunc FR32:$src)),
-          (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
-def : Pat<(f64 (ftrunc FR64:$src)),
-          (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+let Predicates = [UseSSE41] in {
+  def : Pat<(ffloor FR32:$src),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+  def : Pat<(f64 (ffloor FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+  def : Pat<(f32 (fnearbyint FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+  def : Pat<(f64 (fnearbyint FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+  def : Pat<(f32 (fceil FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+  def : Pat<(f64 (fceil FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+  def : Pat<(f32 (frint FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+  def : Pat<(f64 (frint FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+  def : Pat<(f32 (ftrunc FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+  def : Pat<(f64 (ftrunc FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+  def : Pat<(v4f32 (ffloor VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x1))>;
+  def : Pat<(v2f64 (ffloor VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x1))>;
+}
 
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Packed Bit Test
@@ -6356,7 +6392,7 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
           (bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
 }
 
-/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
 multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId256> {
   let isCommutable = 1 in
@@ -6705,7 +6741,7 @@ def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
 def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
                 (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
 
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
   def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
                             (v16i8 VR128:$src2))),
             (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
@@ -6802,9 +6838,8 @@ multiclass pseudo_pcmpistrm<string asm> {
 }
 
 let Defs = [EFLAGS], usesCustomInserter = 1 in {
-  let AddedComplexity = 1 in
-    defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
-  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[HasSSE42]>;
+  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
+  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
 }
 
 let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in {
@@ -6840,9 +6875,8 @@ multiclass pseudo_pcmpestrm<string asm> {
 }
 
 let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
-  let AddedComplexity = 1 in
-    defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
-  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[HasSSE42]>;
+  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
+  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
 }
 
 let Predicates = [HasAVX],
@@ -7237,40 +7271,59 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
 
 let Predicates = [HasAVX] in {
 def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+}
+
+let Predicates = [HasAVX1Only] in {
 def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
-                                   (i32 imm)),
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+                                   (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
-                                   (i32 imm)),
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+                                   (bc_v4i32 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
-                                   (i32 imm)),
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+                                   (bc_v16i8 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+                                   (bc_v8i16 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 }
@@ -7290,56 +7343,61 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
           []>, VEX;
 }
 
-// Extract and store.
-let Predicates = [HasAVX] in {
-  def : Pat<(alignedstore (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), addr:$dst),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-  def : Pat<(alignedstore (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), addr:$dst),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-  def : Pat<(alignedstore (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), addr:$dst),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-
-  def : Pat<(int_x86_sse_storeu_ps addr:$dst, (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2)),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2)),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-  def : Pat<(int_x86_sse2_storeu_dq addr:$dst, (bc_v16i8 (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2))),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-}
-
 // AVX1 patterns
 let Predicates = [HasAVX] in {
-def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2),
-          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2),
-          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2),
-          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v4f32 (VEXTRACTF128rr
                     (v8f32 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v2f64 (VEXTRACTF128rr
                     (v4f64 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+
+def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v2i64 (VEXTRACTF128rr
-                    (v4i64 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+                  (v4i64 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v4i32 (VEXTRACTF128rr
-                    (v8i32 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+                  (v8i32 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v8i16 (VEXTRACTF128rr
-                    (v16i16 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+                  (v16i16 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v16i8 (VEXTRACTF128rr
-                    (v32i8 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+                  (v32i8 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7456,29 +7514,29 @@ def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
 }
 
 let Predicates = [HasAVX] in {
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
+                  (memopv4f64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX1Only] in {
 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
-          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 
-def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1,
-                  (memopv8f32 addr:$src2), (i8 imm:$imm))),
-          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
                   (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                   (memopv4i64 addr:$src2), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
-                  (memopv4f64 addr:$src2), (i8 imm:$imm))),
-          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
                   (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
@@ -7665,19 +7723,22 @@ let Predicates = [HasAVX2] in {
 }
 
 // AVX1 broadcast patterns
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX1Only] in {
 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
           (VBROADCASTSSYrm addr:$src)>;
 def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
           (VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSSrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
 def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
           (VBROADCASTSSYrm addr:$src)>;
 def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
           (VBROADCASTSDYrm addr:$src)>;
 def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
           (VBROADCASTSSrm addr:$src)>;
-def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
-          (VBROADCASTSSrm addr:$src)>;
 
   // Provide fallback in case the load node that is used in the patterns above
   // is used by additional users, which prevents the pattern selection.
@@ -7757,7 +7818,6 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W;
 //===----------------------------------------------------------------------===//
 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
 //
-let AddedComplexity = 1 in {
 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, i8imm:$src3),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -7768,9 +7828,8 @@ def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
                              (i8 imm:$src3)))]>, VEX_4V;
-}
 
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
+let Predicates = [HasAVX2] in {
 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
@@ -7805,23 +7864,43 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
           []>, VEX_4V;
 }
 
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
+let Predicates = [HasAVX2] in {
 def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+                                   (bc_v4i32 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+                                   (bc_v16i8 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+                                   (bc_v8i16 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7838,23 +7917,40 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
           (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
           "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX;
 
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+let Predicates = [HasAVX2] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v2i64 (VEXTRACTI128rr
                     (v4i64 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v4i32 (VEXTRACTI128rr
                     (v8i32 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v8i16 (VEXTRACTI128rr
                     (v16i16 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v16i8 (VEXTRACTI128rr
                     (v32i8 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index 7ac4cec..764aa5d 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -532,7 +532,7 @@ uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) {
 #endif
 }
 
-template<typename T> void addUnaligned(void *Pos, T Delta) {
+template<typename T> static void addUnaligned(void *Pos, T Delta) {
   T Value;
   std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos),
               sizeof(T));
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 9c0ce4e..1c2ef25 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -377,12 +377,6 @@ ReSimplify:
   case X86::SETB_C64r:    LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break;
   case X86::MOV8r0:       LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
   case X86::MOV32r0:      LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
-  case X86::V_SETALLONES:  LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
-  case X86::AVX_SET0PSY:   LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break;
-  case X86::AVX_SET0PDY:   LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
-  case X86::AVX_SETALLONES:  LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
-  case X86::AVX2_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDYrr);break;
-  case X86::AVX2_SET0:     LowerUnaryToTwoAddr(OutMI, X86::VPXORYrr); break;
 
   case X86::MOV16r0:
     LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV16r0 -> MOV32r0
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 877b8f6..3b4cfc4 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -522,7 +522,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
 
 void
 X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
-                                     int SPAdj, RegScavenger *RS) const{
+                                     int SPAdj, RegScavenger *RS) const {
   assert(SPAdj == 0 && "Unexpected");
 
   unsigned i = 0;
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 9087852..0d7b664 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -346,6 +346,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
   , HasVectorUAMem(false)
   , HasCmpxchg16b(false)
   , UseLeaForSP(false)
+  , HasSlowDivide(false)
   , PostRAScheduler(false)
   , stackAlignment(4)
   // FIXME: this is a known good value for Yonah. How about others?
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 6841c5b..dde7e24 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -136,6 +136,10 @@ protected:
   /// the stack pointer. This is an optimization for Intel Atom processors.
   bool UseLeaForSP;
 
+  /// HasSlowDivide - True if smaller divides are significantly faster than
+  /// full divides and should be used when possible.
+  bool HasSlowDivide;
+
   /// PostRAScheduler - True if using post-register-allocation scheduler.
   bool PostRAScheduler;
 
@@ -198,6 +202,7 @@ public:
   bool hasSSE42() const { return X86SSELevel >= SSE42; }
   bool hasAVX() const { return X86SSELevel >= AVX; }
   bool hasAVX2() const { return X86SSELevel >= AVX2; }
+  bool hasNoAVX() const { return X86SSELevel < AVX; }
   bool hasSSE4A() const { return HasSSE4A; }
   bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
   bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
@@ -205,7 +210,8 @@ public:
   bool hasAES() const { return HasAES; }
   bool hasPCLMUL() const { return HasPCLMUL; }
   bool hasFMA() const { return HasFMA; }
-  bool hasFMA4() const { return HasFMA4; }
+  // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
+  bool hasFMA4() const { return HasFMA4 && !HasFMA; }
   bool hasXOP() const { return HasXOP; }
   bool hasMOVBE() const { return HasMOVBE; }
   bool hasRDRAND() const { return HasRDRAND; }
@@ -219,6 +225,7 @@ public:
   bool hasVectorUAMem() const { return HasVectorUAMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
+  bool hasSlowDivide() const { return HasSlowDivide; }
 
   bool isAtom() const { return X86ProcFamily == IntelAtom; }
 
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 80b75dc..449eed3 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -42,7 +42,6 @@ namespace {
 
   private:
     const TargetInstrInfo *TII; // Machine instruction info.
-    MachineBasicBlock *MBB;     // Current basic block
 
     // Any YMM register live-in to this function?
     bool FnHasLiveInYmm;
@@ -84,7 +83,7 @@ namespace {
     //  2) All states must be clean for the result to be clean
     //  3) If none above and one unknown, the result state is also unknown
     //
-    unsigned computeState(unsigned PrevState, unsigned CurState) {
+    static unsigned computeState(unsigned PrevState, unsigned CurState) {
       if (PrevState == ST_INIT)
         return CurState;
 
@@ -122,7 +121,7 @@ static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
 }
 
 static bool hasYmmReg(MachineInstr *MI) {
-  for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
     if (!MO.isReg())
       continue;
@@ -189,7 +188,6 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
                                            MachineBasicBlock &BB) {
   bool Changed = false;
   unsigned BBNum = BB.getNumber();
-  MBB = &BB;
 
   // Don't process already solved BBs
   if (BBSolved[BBNum])
@@ -207,7 +205,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
 
   // The entry MBB for the function may set the initial state to dirty if
   // the function receives any YMM incoming arguments
-  if (MBB == MF.begin()) {
+  if (&BB == MF.begin()) {
     EntryState = ST_CLEAN;
     if (FnHasLiveInYmm)
       EntryState = ST_DIRTY;
@@ -253,7 +251,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
       // When unknown, only compute the information within the block to have
       // it available in the exit if possible, but don't change the block.
       if (EntryState != ST_UNKNOWN) {
-        BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
+        BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
         ++NumVZU;
       }
 
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index ae646a2..3e7666b 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -33,7 +33,7 @@ def XCoreBranchLink     : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink,
                              SDNPVariadic]>;
 
 def XCoreRetsp       : SDNode<"XCoreISD::RETSP", SDTBrind,
-                         [SDNPHasChain, SDNPOptInGlue]>;
+                         [SDNPHasChain, SDNPOptInGlue, SDNPMayLoad]>;
 
 def SDT_XCoreBR_JT    : SDTypeProfile<0, 2,
                                       [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
@@ -58,7 +58,7 @@ def cprelwrapper : SDNode<"XCoreISD::CPRelativeWrapper", SDT_XCoreAddress,
 
 def SDT_XCoreStwsp    : SDTypeProfile<0, 2, [SDTCisInt<1>]>;
 def XCoreStwsp        : SDNode<"XCoreISD::STWSP", SDT_XCoreStwsp,
-                               [SDNPHasChain]>;
+                               [SDNPHasChain, SDNPMayStore]>;
 
 // These are target-independent nodes, but have target-specific formats.
 def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index cdd0a08..be5855a 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -176,7 +176,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
 
   #ifndef NDEBUG
   DEBUG(errs() << "\nFunction         : " 
-        << MF.getFunction()->getName() << "\n");
+        << MF.getName() << "\n");
   DEBUG(errs() << "<--------->\n");
   DEBUG(MI.print(errs()));
   DEBUG(errs() << "FrameIndex         : " << FrameIndex << "\n");
author	Stephen Hines <srhines@google.com>	2012-09-13 19:09:19 -0700
committer	Android Git Automerger <android-git-automerger@android.com>	2012-09-13 19:09:19 -0700
commit	78c041bd883d86c81c42b98f326660277e6d0d9a (patch)
tree	52800183ec2d22164b8f396842142c3a8aab912a /lib/Target
parent	828ded66831c0caaeecd2291a6bfb084f373d0e4 (diff)
parent	1c4ad5ef4fab105f0c8af7edd026e00502fb6279 (diff)
download	external_llvm-78c041bd883d86c81c42b98f326660277e6d0d9a.zip external_llvm-78c041bd883d86c81c42b98f326660277e6d0d9a.tar.gz external_llvm-78c041bd883d86c81c42b98f326660277e6d0d9a.tar.bz2